Spaces:

MachineLearningReply
/

q-and-a-tool

Sleeping

App Files Files Community

karshreya98 committed on Oct 19, 2023

Commit

e2fe55a

•

1 Parent(s): 8329090

added upload functionality

Browse files

Files changed (2) hide show

app.py +87 -11
utils/haystack.py +44 -17

app.py CHANGED Viewed

@@ -7,27 +7,103 @@ from annotated_text import annotation
 from json import JSONDecodeError
 from markdown import markdown
 from utils.config import parser
-from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query
-from utils.ui import reset_results, set_initial_state
 try:
     args = parser.parse_args()
-    document_store = start_document_store(type = args.store)
     if args.task == 'extractive':
-        pipeline = start_haystack_extractive(document_store)
     else:
-        pipeline = start_haystack_rag(document_store)
-    set_initial_state()
-    st.write('# '+args.name)
     # Search bar
     question = st.text_input("Ask a question", value=st.session_state.question, max_chars=100, on_change=reset_results)
-    #question = "what is Pi?"
     run_pressed = st.button("Run")
-    #run_pressed = True
     run_query = (
         run_pressed or question != st.session_state.question
@@ -43,7 +119,7 @@ try:
             except JSONDecodeError as je:
                 st.error(
                     "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
-                )
             except Exception as e:
                 logging.exception(e)
                 st.error("🐞 &nbsp;&nbsp; An error occurred during the request.")

 from json import JSONDecodeError
 from markdown import markdown
 from utils.config import parser
+from utils.haystack import start_document_store, start_haystack_extractive, start_haystack_rag, query, start_preprocessor_node, start_retriever, start_reader
+from utils.ui import reset_results, set_initial_state, upload_doc
+# Slider defaults; each can be overridden through an environment variable.
+DEFAULT_DOCS_FROM_RETRIEVER = int(os.getenv("DEFAULT_DOCS_FROM_RETRIEVER", "3"))
+DEFAULT_NUMBER_OF_ANSWERS = int(os.getenv("DEFAULT_NUMBER_OF_ANSWERS", "3"))
+# Labels for the evaluation
+#EVAL_LABELS = os.getenv("EVAL_FILE", str(Path(__file__).parent / "eval_labels_volksbank_QA.csv"))
+# Whether the file upload should be disabled.
+# os.getenv returns a string and bool() of any non-empty string is True, so a
+# plain bool() would treat DISABLE_FILE_UPLOAD=false / =0 as "disabled".
+# Explicitly falsy spellings (and an unset/empty variable) keep uploads
+# enabled; every other value still disables them, as before.
+DISABLE_FILE_UPLOAD = os.getenv("DISABLE_FILE_UPLOAD", "").strip().lower() not in ("", "0", "false", "no", "off")
+UPLOAD_DOCUMENTS = []
+# Define a function to handle file uploads
+def upload_files():
+    """Render the sidebar file uploader and return what the widget yields."""
+    accepted_types = ["pdf", "txt", "docx"]
+    return st.sidebar.file_uploader(
+        "upload",
+        type=accepted_types,
+        accept_multiple_files=True,
+        label_visibility="hidden",
+    )
+# Define a function to process a single file
+def process_file(data_file, preprocesor, document_store):
+    """Preprocess one uploaded file and index it in the document store.
+
+    Files whose name is already present in the store are skipped. After
+    writing, embeddings are refreshed with the module-level ``retriever``.
+
+    Args:
+        data_file: Uploaded file object exposing ``read()`` and ``name``.
+        preprocesor: Object whose ``process(docs)`` returns the documents
+            to write.
+        document_store: Store exposing ``get_all_documents()``,
+            ``write_documents()`` and ``update_embeddings()``.
+
+    Raises:
+        Exception: Any failure is logged and re-raised so the caller can
+            mark the file as failed instead of reporting success.
+    """
+    try:
+        # Check for duplicates before reading, so known files cost nothing.
+        already_indexed = any(
+            item.meta.get('name') == data_file.name
+            for item in document_store.get_all_documents()
+        )
+        if already_indexed:
+            print(f"{data_file.name} already processed")
+            return
+
+        file_contents = data_file.read()
+        # read() yields bytes for uploads; str(bytes) would store the
+        # "b'...'" repr (escapes and all) rather than the text itself.
+        if isinstance(file_contents, (bytes, bytearray)):
+            text = bytes(file_contents).decode("utf-8", errors="replace")
+        else:
+            text = str(file_contents)
+        # NOTE(review): pdf/docx uploads are binary containers; decoding
+        # them as UTF-8 does not extract their text. A converter is
+        # presumably still needed for those types - confirm.
+        docs = [{
+            'content': text,
+            'meta': {'name': str(data_file.name)}
+        }]
+
+        print(f'preprocessing uploaded doc {data_file.name}.......')
+        preprocessed_docs = preprocesor.process(docs)
+        print('writing to document store.......')
+        document_store.write_documents(preprocessed_docs)
+        print('updating emebdding.......')
+        document_store.update_embeddings(retriever)
+    except Exception:
+        # Log with traceback, then propagate: swallowing the error here made
+        # the caller's failure branch unreachable.
+        logging.exception("Failed to process uploaded file %s", data_file.name)
+        raise
 try:
     args = parser.parse_args()
+    set_initial_state()
+    st.write('# '+args.name)
+    session_state = st.session_state
+    preprocesor = start_preprocessor_node()
+    document_store = start_document_store(args.store)
+    retriever = start_retriever(document_store)
+    reader = start_reader()
     if args.task == 'extractive':
+        pipeline = start_haystack_extractive(document_store, retriever, reader)
     else:
+        pipeline = start_haystack_rag(document_store, retriever)
+    # Sidebar
+    #st.sidebar.header("Options")
+    # File upload block
+    if not DISABLE_FILE_UPLOAD:
+        st.sidebar.write("## File Upload:")
+        #data_files = st.sidebar.file_uploader(
+        #    "upload", type=["pdf", "txt", "docx"], accept_multiple_files=True, label_visibility="hidden"
+        #)
+        data_files = upload_files()
+        if data_files is not None:
+            for data_file in data_files:
+                # Upload file
+                if data_file:
+                    try:
+                        #raw_json = upload_doc(data_file)
+                        # Call the process_file function for each uploaded file
+                        if args.store == 'inmemory':
+                            processed_data = process_file(data_file, preprocesor, document_store)
+                        st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ✅ ")
+                    except Exception as e:
+                        st.sidebar.write(str(data_file.name) + " &nbsp;&nbsp; ❌ ")
+                        st.sidebar.write("_This file could not be parsed, see the logs for more information._")
     # Search bar
     question = st.text_input("Ask a question", value=st.session_state.question, max_chars=100, on_change=reset_results)
+    # question = "what is Pi?"
     run_pressed = st.button("Run")
+    # run_pressed = True
     run_query = (
         run_pressed or question != st.session_state.question
             except JSONDecodeError as je:
                 st.error(
                     "👓 &nbsp;&nbsp; An error occurred reading the results. Is the document store working?"
+                )
             except Exception as e:
                 logging.exception(e)
                 st.error("🐞 &nbsp;&nbsp; An error occurred during the request.")

utils/haystack.py CHANGED Viewed

@@ -5,13 +5,31 @@ from haystack import Pipeline
 from haystack.schema import Answer
 from haystack.document_stores import BaseDocumentStore
 from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
-from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode
 from milvus_haystack import MilvusDocumentStore
 #Use this file to set up your Haystack pipeline and querying
 @st.cache_resource(show_spinner=False)
 def start_document_store(type: str):
     #This function starts the documents store of your choice based on your command line preference
     if type == 'inmemory':
         document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=384)
         documents = [
@@ -24,7 +42,7 @@ def start_document_store(type: str):
                 'meta': {'name': "siemens.txt"}
             },
         ]
-        document_store.write_documents(documents)
     elif type == 'opensearch':
         document_store = OpenSearchDocumentStore(scheme = document_store_configs['OPENSEARCH_SCHEME'],
                                                  username = document_store_configs['OPENSEARCH_USERNAME'],
@@ -45,34 +63,43 @@ def start_document_store(type: str):
                                             return_embedding=True)
     return document_store
-# cached to make index and models load only at start
 @st.cache_resource(show_spinner=False)
-def start_haystack_extractive(_document_store: BaseDocumentStore):
-    retriever = EmbeddingRetriever(document_store=_document_store,
-                                   embedding_model=model_configs['EMBEDDING_MODEL'],
                                    top_k=5)
-    _document_store.update_embeddings(retriever)
     reader = FARMReader(model_name_or_path=model_configs['EXTRACTIVE_MODEL'])
-    pipe = Pipeline()
-    pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
-    pipe.add_node(component=reader, name="Reader", inputs=["Retriever"])
     return pipe
 @st.cache_resource(show_spinner=False)
-def start_haystack_rag(_document_store: BaseDocumentStore):
-    retriever = EmbeddingRetriever(document_store=_document_store,
-                                   embedding_model=model_configs['EMBEDDING_MODEL'],
-                                   top_k=5)
-    _document_store.update_embeddings(retriever)
     prompt_node = PromptNode(default_prompt_template="deepset/question-answering",
                              model_name_or_path=model_configs['GENERATIVE_MODEL'],
                              api_key=model_configs['OPENAI_KEY'])
     pipe = Pipeline()
-    pipe.add_node(component=retriever, name="Retriever", inputs=["Query"])
     pipe.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])
     return pipe

 from haystack.schema import Answer
 from haystack.document_stores import BaseDocumentStore
 from haystack.document_stores import InMemoryDocumentStore, OpenSearchDocumentStore, WeaviateDocumentStore
+from haystack.nodes import EmbeddingRetriever, FARMReader, PromptNode, PreProcessor
 from milvus_haystack import MilvusDocumentStore
 #Use this file to set up your Haystack pipeline and querying
+@st.cache_resource(show_spinner=False)
+def start_preprocessor_node():
+    """Build the PreProcessor used to clean and split documents."""
+    print('initializing preprocessor node')
+    # remove_substrings, split_overlap and max_chars_check are not passed,
+    # so they stay at the library defaults.
+    settings = dict(
+        clean_empty_lines=True,
+        clean_whitespace=True,
+        clean_header_footer=True,
+        split_by="word",
+        split_length=100,
+        split_respect_sentence_boundary=True,
+    )
+    return PreProcessor(**settings)
 @st.cache_resource(show_spinner=False)
 def start_document_store(type: str):
     #This function starts the documents store of your choice based on your command line preference
+    print('initializing document store')
     if type == 'inmemory':
         document_store = InMemoryDocumentStore(use_bm25=True, embedding_dim=384)
         documents = [
                 'meta': {'name': "siemens.txt"}
             },
         ]
+        #document_store.write_documents(documents)
     elif type == 'opensearch':
         document_store = OpenSearchDocumentStore(scheme = document_store_configs['OPENSEARCH_SCHEME'],
                                                  username = document_store_configs['OPENSEARCH_USERNAME'],
                                             return_embedding=True)
     return document_store
 @st.cache_resource(show_spinner=False)
+def start_retriever(_document_store: BaseDocumentStore):
+    # Build the EmbeddingRetriever over the given document store, using the
+    # embedding model named in model_configs and returning the top 5 hits.
+    # NOTE(review): the leading underscore on the parameter presumably tells
+    # st.cache_resource not to hash it - confirm against the Streamlit docs.
+    print('initializing retriever')
+    retriever = EmbeddingRetriever(document_store=_document_store,
+                                   embedding_model=model_configs['EMBEDDING_MODEL'],
                                    top_k=5)
+    # Embeddings are not refreshed here (call left disabled below); callers
+    # must run update_embeddings themselves after writing documents.
+    #_document_store.update_embeddings(retriever)
+    return retriever
+@st.cache_resource(show_spinner=False)
+def start_reader():
+    # Build the FARMReader from the model named by
+    # model_configs['EXTRACTIVE_MODEL'] and return it.
+    print('initializing reader')
     reader = FARMReader(model_name_or_path=model_configs['EXTRACTIVE_MODEL'])
+    return reader
+# cached to make index and models load only at start
+@st.cache_resource(show_spinner=False)
+def start_haystack_extractive(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever, _reader: FARMReader):
+    # Assemble the extractive QA pipeline: Query -> Retriever -> Reader.
+    # The retriever and reader are supplied by the caller rather than built
+    # here; _document_store is accepted but not used in this body.
+    print('initializing pipeline')
+    pipe = Pipeline()
+    pipe.add_node(component=_retriever, name="Retriever", inputs=["Query"])
+    pipe.add_node(component= _reader, name="Reader", inputs=["Retriever"])
     return pipe
 @st.cache_resource(show_spinner=False)
+def start_haystack_rag(_document_store: BaseDocumentStore, _retriever: EmbeddingRetriever):
+    # Assemble the generative pipeline: Query -> Retriever -> PromptNode.
+    # The retriever is supplied by the caller; _document_store is accepted
+    # but not used in this body.
     prompt_node = PromptNode(default_prompt_template="deepset/question-answering",
                              model_name_or_path=model_configs['GENERATIVE_MODEL'],
                              api_key=model_configs['OPENAI_KEY'])
     pipe = Pipeline()
+    pipe.add_node(component=_retriever, name="Retriever", inputs=["Query"])
     pipe.add_node(component=prompt_node, name="PromptNode", inputs=["Retriever"])
     return pipe