karshreya98 committed on
Commit
8b2d8aa
1 Parent(s): 8e3f504

resolving merge conflicts

Browse files
Files changed (3) hide show
  1. app.py +70 -9
  2. utils/haystack.py +48 -18
  3. utils/ui.py +1 -1
app.py CHANGED
@@ -7,14 +7,52 @@ from annotated_text import annotation
7
  from json import JSONDecodeError
8
  from markdown import markdown
9
  from utils.config import parser
10
- from utils.haystack import start_document_store, query, initialize_pipeline
11
  from utils.ui import reset_results, set_initial_state
12
  import pandas as pd
13
  import haystack
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  try:
16
  args = parser.parse_args()
 
17
  document_store = start_document_store(type=args.store)
 
 
18
  st.set_page_config(
19
  page_title="MLReplySearch",
20
  layout="centered",
@@ -42,19 +80,42 @@ try:
42
 
43
  # Check the task and initialize pipeline accordingly
44
  if task_selection == 'Extractive':
45
- pipeline_extractive = initialize_pipeline("extractive", document_store)
46
  elif task_selection == 'Generative' and openai_key: # Check for openai_key to ensure user has entered it
47
- pipeline_rag = initialize_pipeline("rag", document_store, openai_key=openai_key)
 
48
 
49
  set_initial_state()
50
 
51
  st.write('# ' + args.name)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  if "question" not in st.session_state:
54
  st.session_state.question = ""
55
  # Search bar
56
  question = st.text_input("", value=st.session_state.question, max_chars=100, on_change=reset_results)
57
-
58
  run_pressed = st.button("Run")
59
 
60
  run_query = (
@@ -73,11 +134,11 @@ try:
73
  except JSONDecodeError as je:
74
  st.error(
75
  "👓    An error occurred reading the results. Is the document store working?"
76
- )
77
  except Exception as e:
78
  logging.exception(e)
79
  st.error("🐞    An error occurred during the request.")
80
-
81
  elif task_selection == 'Generative':
82
  reset_results()
83
  st.session_state.question = question
@@ -88,7 +149,7 @@ try:
88
  except JSONDecodeError as je:
89
  st.error(
90
  "👓    An error occurred reading the results. Is the document store working?"
91
- )
92
  except Exception as e:
93
  if "API key is invalid" in str(e):
94
  logging.exception(e)
@@ -98,11 +159,11 @@ try:
98
  st.error("🐞    An error occurred during the request.")
99
  # Display results
100
  if (st.session_state.results_extractive or st.session_state.results_generative) and run_query:
101
-
102
  # Handle Extractive Answers
103
  if task_selection == 'Extractive':
104
  results = st.session_state.results_extractive
105
-
106
  st.subheader("Extracted Answers:")
107
 
108
  if 'answers' in results:
 
7
  from json import JSONDecodeError
8
  from markdown import markdown
9
  from utils.config import parser
10
+ from utils.haystack import start_document_store, query, initialize_pipeline, start_preprocessor_node, start_retriever, start_reader
11
  from utils.ui import reset_results, set_initial_state
12
  import pandas as pd
13
  import haystack
14
 
15
# Whether the file upload should be enabled or not.
# NOTE(review): relies on `os` being imported at the top of app.py — confirm.
DISABLE_FILE_UPLOAD = bool(os.getenv("DISABLE_FILE_UPLOAD"))

def upload_files():
    """Render the sidebar file uploader and return the files the user picked."""
    return st.sidebar.file_uploader(
        "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
    )
23
+
24
# Define a function to process a single file

def process_file(data_file, preprocesor, document_store, retriever=None):
    """Read an uploaded file, preprocess it, and index it in the document store.

    Files whose name is already present in the store are skipped. The new
    optional ``retriever`` parameter fixes the original's hidden dependency on
    a module-level ``retriever`` global; omitting it preserves the old
    behaviour, so existing positional callers are unaffected.

    Errors are printed and swallowed on purpose (best-effort upload) so one
    bad file does not abort the app.
    """
    # NOTE(review): decoding as UTF-8 only works for plain-text uploads; the
    # uploader also accepts pdf/docx, which will fail here — confirm intent.
    file_contents = data_file.read().decode("utf-8")
    docs = [{
        'content': str(file_contents),
        'meta': {'name': str(data_file.name)}
    }]
    try:
        # Dedupe by document name against what is already indexed.
        names = [item.meta.get('name') for item in document_store.get_all_documents()]
        if data_file.name in names:
            print(f"{data_file.name} already processed")
        else:
            print(f'preprocessing uploaded doc {data_file.name}.......')
            preprocessed_docs = preprocesor.process(docs)
            print('writing to document store.......')
            document_store.write_documents(preprocessed_docs)
            # Typo fixed in the log message ("emebdding" -> "embedding").
            print('updating embedding.......')
            if retriever is None:
                # Backward-compatible fallback to the module-level retriever.
                retriever = globals()["retriever"]
            document_store.update_embeddings(retriever)
    except Exception as e:
        print(e)
49
+
50
  try:
51
  args = parser.parse_args()
52
+ preprocesor = start_preprocessor_node()
53
  document_store = start_document_store(type=args.store)
54
+ retriever = start_retriever(document_store)
55
+ reader = start_reader()
56
  st.set_page_config(
57
  page_title="MLReplySearch",
58
  layout="centered",
 
80
 
81
  # Check the task and initialize pipeline accordingly
82
  if task_selection == 'Extractive':
83
+ pipeline_extractive = initialize_pipeline("extractive", document_store, retriever, reader)
84
  elif task_selection == 'Generative' and openai_key: # Check for openai_key to ensure user has entered it
85
+ pipeline_rag = initialize_pipeline("rag", document_store, retriever, reader, openai_key=openai_key)
86
+
87
 
88
  set_initial_state()
89
 
90
  st.write('# ' + args.name)
91
 
92
+
93
+ # File upload block
94
+ if not DISABLE_FILE_UPLOAD:
95
+ st.sidebar.write("## File Upload:")
96
+ #data_files = st.sidebar.file_uploader(
97
+ # "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
98
+ #)
99
+ data_files = upload_files()
100
+ if data_files is not None:
101
+ for data_file in data_files:
102
+ # Upload file
103
+ if data_file:
104
+ try:
105
+ #raw_json = upload_doc(data_file)
106
+ # Call the process_file function for each uploaded file
107
+ if args.store == 'inmemory':
108
+ processed_data = process_file(data_file, preprocesor, document_store)
109
+ st.sidebar.write(str(data_file.name) + "    ✅ ")
110
+ except Exception as e:
111
+ st.sidebar.write(str(data_file.name) + "    ❌ ")
112
+ st.sidebar.write("_This file could not be parsed, see the logs for more information._")
113
+
114
  if "question" not in st.session_state:
115
  st.session_state.question = ""
116
  # Search bar
117
  question = st.text_input("", value=st.session_state.question, max_chars=100, on_change=reset_results)
118
+
119
  run_pressed = st.button("Run")
120
 
121
  run_query = (
 
134
  except JSONDecodeError as je:
135
  st.error(
136
  "👓    An error occurred reading the results. Is the document store working?"
137
+ )
138
  except Exception as e:
139
  logging.exception(e)
140
  st.error("🐞    An error occurred during the request.")
141
+
142
  elif task_selection == 'Generative':
143
  reset_results()
144
  st.session_state.question = question
 
149
  except JSONDecodeError as je:
150
  st.error(
151
  "👓    An error occurred reading the results. Is the document store working?"
152
+ )
153
  except Exception as e:
154
  if "API key is invalid" in str(e):
155
  logging.exception(e)
 
159
  st.error("🐞    An error occurred during the request.")
160
  # Display results
161
  if (st.session_state.results_extractive or st.session_state.results_generative) and run_query:
162
+
163
  # Handle Extractive Answers
164
  if task_selection == 'Extractive':
165
  results = st.session_state.results_extractive
166
+
167
  st.subheader("Extracted Answers:")
168
 
169
  if 'answers' in results:
utils/haystack.py CHANGED
@@ -5,15 +5,34 @@ from haystack import Pipeline
5
  from haystack.schema import Answer
6
  from haystack.document_stores import BaseDocumentStore
7
  from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
8
- from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode
9
  from milvus_haystack import MilvusDocumentStore
10
  #Use this file to set up your Haystack pipeline and querying
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  @st.cache_resource(show_spinner=False)
13
  def start_document_store(type: str):
14
  #This function starts the documents store of your choice based on your command line preference
 
15
  if type == 'inmemory':
16
  document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=384)
 
17
  documents = [
18
  {
19
  'content': "Pi is a super dog",
@@ -25,6 +44,7 @@ def start_document_store(type: str):
25
  },
26
  ]
27
  document_store.write_documents(documents)
 
28
  elif type == 'opensearch':
29
  document_store = OpenSearchDocumentStore(scheme = document_store_configs['OPENSEARCH_SCHEME'],
30
  username = document_store_configs['OPENSEARCH_USERNAME'],
@@ -47,32 +67,42 @@ def start_document_store(type: str):
47
 
48
  # cached to make index and models load only at start
49
  @st.cache_resource(show_spinner=False)
50
- def start_haystack_extractive(_document_store: BaseDocumentStore):
51
- retriever = EmbeddingRetriever(document_store=_document_store,
52
- embedding_model=model_configs['EMBEDDING_MODEL'],
 
53
  top_k=5)
54
- _document_store.update_embeddings(retriever)
 
 
 
55
 
 
 
 
 
56
  reader = FARMReader(model_name_or_path=model_configs['EXTRACTIVE_MODEL'])
57
-
58
- pipe = Pipeline()
59
- pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
60
- pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])
61
 
 
 
 
 
 
 
 
 
 
62
  return pipe
63
 
64
  @st.cache_resource(show_spinner=False)
65
- def start_haystack_rag(_document_store: BaseDocumentStore, openai_key):
66
- retriever = EmbeddingRetriever(document_store=_document_store,
67
- embedding_model=model_configs['EMBEDDING_MODEL'],
68
- top_k=5)
69
- _document_store.update_embeddings(retriever)
70
  prompt_node = PromptNode(default_prompt_template="deepset/question-answering",
71
  model_name_or_path=model_configs['GENERATIVE_MODEL'],
72
  api_key=openai_key)
73
  pipe = Pipeline()
74
 
75
- pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
76
  pipe.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])
77
 
78
  return pipe
@@ -83,8 +113,8 @@ def query(_pipeline, question):
83
  results = _pipeline.run(question, params=params)
84
  return results
85
 
86
- def initialize_pipeline(task, document_store, openai_key = ""):
87
  if task == 'extractive':
88
- return start_haystack_extractive(document_store)
89
  elif task == 'rag':
90
- return start_haystack_rag(document_store, openai_key)
 
5
  from haystack.schema import Answer
6
  from haystack.document_stores import BaseDocumentStore
7
  from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
8
+ from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode, PreProcessor
9
  from milvus_haystack import MilvusDocumentStore
10
  #Use this file to set up your Haystack pipeline and querying
11
 
12
@st.cache_resource(show_spinner=False)
def start_preprocessor_node():
    """Build (and cache) the PreProcessor that cleans and splits uploads."""
    print('initializing preprocessor node')
    # 100-word chunks, split on sentence boundaries, with basic cleaning.
    return PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=100,
        split_respect_sentence_boundary=True,
    )
28
+
29
  @st.cache_resource(show_spinner=False)
30
  def start_document_store(type: str):
31
  #This function starts the documents store of your choice based on your command line preference
32
+ print('initializing document store')
33
  if type == 'inmemory':
34
  document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=384)
35
+ '''
36
  documents = [
37
  {
38
  'content': "Pi is a super dog",
 
44
  },
45
  ]
46
  document_store.write_documents(documents)
47
+ '''
48
  elif type == 'opensearch':
49
  document_store = OpenSearchDocumentStore(scheme = document_store_configs['OPENSEARCH_SCHEME'],
50
  username = document_store_configs['OPENSEARCH_USERNAME'],
 
67
 
68
  # cached to make index and models load only at start
69
@st.cache_resource(show_spinner=False)
def start_retriever(_document_store: BaseDocumentStore):
    """Create (and cache) the embedding retriever bound to the given store."""
    print('initializing retriever')
    return EmbeddingRetriever(
        document_store=_document_store,
        embedding_model=model_configs['EMBEDDING_MODEL'],
        top_k=5,
    )
79
 
80
+
81
@st.cache_resource(show_spinner=False)
def start_reader():
    """Load (and cache) the extractive FARM reader model."""
    print('initializing reader')
    return FARMReader(model_name_or_path=model_configs['EXTRACTIVE_MODEL'])
 
 
 
86
 
87
+
88
+
89
# cached to make index and models load only at start
@st.cache_resource(show_spinner=False)
def start_haystack_extractive(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever, _reader: FARMReader):
    """Assemble (and cache) the extractive pipeline: Query -> Retriever -> Reader.

    ``_document_store`` is not used directly, but keeping it in the signature
    makes it part of the cache key so a new store rebuilds the pipeline.
    """
    print('initializing pipeline')
    extractive_pipeline = Pipeline()
    extractive_pipeline.add_node(component=_retriever, name="Retriever", inputs=["Query"])
    extractive_pipeline.add_node(component=_reader, name="Reader", inputs=["Retriever"])
    return extractive_pipeline
97
 
98
@st.cache_resource(show_spinner=False)
def start_haystack_rag(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever, openai_key):
    """Assemble (and cache) the RAG pipeline: Query -> Retriever -> PromptNode.

    ``_document_store`` is not used directly; it participates in the cache key.
    """
    generator = PromptNode(
        default_prompt_template="deepset/question-answering",
        model_name_or_path=model_configs['GENERATIVE_MODEL'],
        api_key=openai_key,
    )
    rag_pipeline = Pipeline()
    rag_pipeline.add_node(component=_retriever, name="Retriever", inputs=["Query"])
    rag_pipeline.add_node(component=generator, name="PromptNode", inputs=["Retriever"])
    return rag_pipeline
 
113
  results = _pipeline.run(question, params=params)
114
  return results
115
 
116
def initialize_pipeline(task, document_store, retriever, reader, openai_key = ""):
    """Return the cached pipeline for ``task``.

    ``task`` is ``'extractive'`` (retriever + reader) or ``'rag'`` (retriever
    + OpenAI prompt node; ``reader`` is accepted for a uniform signature but
    unused there). Previously an unknown task silently returned ``None``;
    now it raises ``ValueError`` so misconfiguration fails loudly.
    """
    if task == 'extractive':
        return start_haystack_extractive(document_store, retriever, reader)
    elif task == 'rag':
        return start_haystack_rag(document_store, retriever, openai_key)
    raise ValueError(f"Unknown task: {task!r} (expected 'extractive' or 'rag')")
utils/ui.py CHANGED
@@ -12,5 +12,5 @@ def set_initial_state():
12
 
13
  def reset_results(*args):
14
  st.session_state.results_extractive = None
15
- st.session_state.results_generative = None
16
  st.session_state.task = None
 
12
 
13
def reset_results(*args):
    """Clear cached results for both task types and the remembered task."""
    # st.session_state supports dict-style access equivalently to attributes.
    for key in ("results_extractive", "results_generative", "task"):
        st.session_state[key] = None