Joan Giner committed on
Commit
f87e387
1 Parent(s): 460caa6

upgraded openai and langchain versions

Browse files
Files changed (3) hide show
  1. app.py +6 -4
  2. requirements.txt +18 -7
  3. src/extractor.py +22 -21
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import openai
2
  import gradio as gr
3
- from langchain.embeddings import OpenAIEmbeddings
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
  from langchain.vectorstores.faiss import FAISS
6
  from langchain.chains.question_answering import load_qa_chain
7
  from langchain.chains import LLMChain
8
- from langchain.llms import OpenAI
9
- from langchain import PromptTemplate
10
  from langchain.docstore.document import Document
11
  import pandas as pd
12
  import os
@@ -24,7 +24,7 @@ load_dotenv()
24
  #openai.api_key=os.getenv("OPEN_AI_API_KEY")
25
  #LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
26
  extractor = Extractor()
27
-
28
  # Define function to handle the Gradio interface
29
  async def extraction(input_file, apikey, dimension):
30
  # Build the chains
@@ -55,6 +55,8 @@ async def ui_extraction(input_file, apikey, dimension):
55
  raise gr.Error("Please upload a data paper")
56
  if (input_file.name.split(".")[-1] != "pdf"):
57
  raise gr.Error("This is not a data paper!, please upload it in .pdf format")
 
 
58
  file_name = input_file.name.split("/")[-1]
59
  results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
60
  # Build results in the correct format for the Gradio front-end
 
1
  import openai
2
  import gradio as gr
3
+ from langchain_openai import OpenAIEmbeddings
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
  from langchain.vectorstores.faiss import FAISS
6
  from langchain.chains.question_answering import load_qa_chain
7
  from langchain.chains import LLMChain
8
+ from langchain_community.llms import OpenAI
9
+ from langchain.prompts import PromptTemplate
10
  from langchain.docstore.document import Document
11
  import pandas as pd
12
  import os
 
24
  #openai.api_key=os.getenv("OPEN_AI_API_KEY")
25
  #LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
26
  extractor = Extractor()
27
+ print(os.getenv("OPEN_AI_API_KEY"))
28
  # Define function to handle the Gradio interface
29
  async def extraction(input_file, apikey, dimension):
30
  # Build the chains
 
55
  raise gr.Error("Please upload a data paper")
56
  if (input_file.name.split(".")[-1] != "pdf"):
57
  raise gr.Error("This is not a data paper!, please upload it in .pdf format")
58
+ if (len(apikey) == 0):
59
+ raise gr.Error("Please inform your OpenAI Apikey")
60
  file_name = input_file.name.split("/")[-1]
61
  results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
62
  # Build results in the correct format for the Gradio front-end
requirements.txt CHANGED
@@ -33,14 +33,22 @@ gradio==3.32.0
33
  gradio_client==0.2.5
34
  h11==0.14.0
35
  httpcore==0.17.2
 
36
  httpx==0.24.1
37
  huggingface-hub==0.14.1
38
  idna==3.4
 
39
  Jinja2==3.1.2
 
 
40
  jsonschema==4.17.3
41
  kiwisolver==1.4.4
42
- langchain==0.0.186
 
 
 
43
  langcodes==3.3.0
 
44
  linkify-it-py==2.0.2
45
  lxml==4.9.2
46
  markdown-it-py==2.2.0
@@ -60,10 +68,10 @@ necessary==0.4.2
60
  networkx==3.1
61
  numexpr==2.8.4
62
  numpy==1.24.3
63
- openai==0.27.7
64
  openapi-schema-pydantic==1.2.4
65
  orjson==3.8.14
66
- packaging==23.1
67
  pandas==1.5.3
68
  pathy==0.10.1
69
  pdf2image==1.16.3
@@ -86,7 +94,7 @@ PyYAML==6.0
86
  regex==2023.5.5
87
  requests==2.31.0
88
  requirements-parser==0.5.0
89
- scipdf @ git+https://github.com/titipata/scipdf_parser@master
90
  semantic-version==2.10.0
91
  six==1.16.0
92
  smart-open==6.3.0
@@ -103,7 +111,7 @@ tabula-py==2.7.0
103
  tenacity==8.2.2
104
  textstat==0.7.3
105
  thinc==8.1.10
106
- tiktoken==0.4.0
107
  tokenizers==0.13.3
108
  toolz==0.12.0
109
  torch==2.0.1
@@ -112,11 +120,14 @@ transformers==4.29.2
112
  typer==0.7.0
113
  types-setuptools==67.8.0.0
114
  typing-inspect==0.9.0
115
- typing_extensions==4.6.2
116
  uc-micro-py==1.0.2
117
- urllib3==2.0.2
118
  uvicorn==0.22.0
 
119
  Wand==0.6.11
120
  wasabi==1.1.1
 
121
  websockets==11.0.3
122
  yarl==1.9.2
 
 
33
  gradio_client==0.2.5
34
  h11==0.14.0
35
  httpcore==0.17.2
36
+ httptools==0.5.0
37
  httpx==0.24.1
38
  huggingface-hub==0.14.1
39
  idna==3.4
40
+ importlib-resources==6.1.1
41
  Jinja2==3.1.2
42
+ jsonpatch==1.33
43
+ jsonpointer==2.4
44
  jsonschema==4.17.3
45
  kiwisolver==1.4.4
46
+ langchain==0.1.2
47
+ langchain-community==0.0.14
48
+ langchain-core==0.1.14
49
+ langchain-openai==0.0.3
50
  langcodes==3.3.0
51
+ langsmith==0.0.83
52
  linkify-it-py==2.0.2
53
  lxml==4.9.2
54
  markdown-it-py==2.2.0
 
68
  networkx==3.1
69
  numexpr==2.8.4
70
  numpy==1.24.3
71
+ openai==1.9.0
72
  openapi-schema-pydantic==1.2.4
73
  orjson==3.8.14
74
+ packaging==23.2
75
  pandas==1.5.3
76
  pathy==0.10.1
77
  pdf2image==1.16.3
 
94
  regex==2023.5.5
95
  requests==2.31.0
96
  requirements-parser==0.5.0
97
+ scipdf==0.1.dev0
98
  semantic-version==2.10.0
99
  six==1.16.0
100
  smart-open==6.3.0
 
111
  tenacity==8.2.2
112
  textstat==0.7.3
113
  thinc==8.1.10
114
+ tiktoken==0.5.2
115
  tokenizers==0.13.3
116
  toolz==0.12.0
117
  torch==2.0.1
 
120
  typer==0.7.0
121
  types-setuptools==67.8.0.0
122
  typing-inspect==0.9.0
123
+ typing_extensions==4.9.0
124
  uc-micro-py==1.0.2
125
+ urllib3==1.26.6
126
  uvicorn==0.22.0
127
+ uvloop==0.17.0
128
  Wand==0.6.11
129
  wasabi==1.1.1
130
+ watchfiles==0.19.0
131
  websockets==11.0.3
132
  yarl==1.9.2
133
+ zipp==3.17.0
src/extractor.py CHANGED
@@ -1,12 +1,14 @@
1
  import openai
2
  import gradio as gr
3
- from langchain.embeddings import OpenAIEmbeddings
 
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
  from langchain.vectorstores.faiss import FAISS
6
  from langchain.chains.question_answering import load_qa_chain
7
  from langchain.chains import LLMChain
8
- from langchain.llms import OpenAI
9
- from langchain import PromptTemplate
 
10
  from langchain.docstore.document import Document
11
  import pandas as pd
12
  import os
@@ -65,11 +67,9 @@ class Extractor:
65
 
66
  # Extract text from PDF file using SCIPDF and Gorbid service (you need gorbid to use it)
67
  def extract_text_from_pdf(self, file_path):
68
- try:
69
- article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
70
- print("PDF parsed")
71
- except:
72
- raise gr.Error("Error parsing PDF, please update your data paper in the correct format")
73
  finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
74
  for section in article_dict['sections']:
75
  sec = section['heading'] + ": "
@@ -95,7 +95,7 @@ class Extractor:
95
  #table_texts.append(query + " "+ result['text'])
96
  table_texts = await asyncio.gather(*table_texts)
97
  for table in table_texts:
98
- docsearch.add_texts(table[1])
99
  return docsearch
100
 
101
  def extract_text_clean(self, file_name, file_path):
@@ -111,9 +111,8 @@ class Extractor:
111
  async def prepare_data(self, file_name, file_path, chain_table, apikey):
112
  # Process text and get the embeddings
113
  vectorspath = "./vectors/"+file_name
114
- if not apikey:
115
  #apikey = openai.api_key
116
- raise gr.Error("Please set your api key")
117
  embeddings = OpenAIEmbeddings(openai_api_key=apikey)
118
  if os.path.isfile(vectorspath+"/index.faiss"):
119
 
@@ -145,17 +144,19 @@ class Extractor:
145
 
146
  # Save the index locally
147
  FAISS.save_local(docsearch, "./vectors/"+file_name)
 
 
 
 
 
 
 
148
 
149
  return docsearch
150
 
151
  def build_chains(self, apikey):
152
- if not apikey:
153
- #apikey = openai.api_key
154
- raise gr.Error("Please set your Api key")
155
- try:
156
- LLMClient = OpenAI(model_name='text-davinci-003',openai_api_key=apikey,temperature=0)
157
- except:
158
- raise gr.Error("Your Api key is not valid")
159
  ## In-context prompt
160
  prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
161
  Question: {question}
@@ -192,14 +193,14 @@ class Extractor:
192
 
193
  async def async_table_generate(self, docs,table,chain):
194
 
195
- resp = await chain.arun({"context": docs, "table": table})
196
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
197
  return resp
198
 
199
  async def async_generate(self, dimension, docs,question,chain):
200
- resp = await chain.arun({"input_documents": docs, "question": question})
201
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
202
- return [dimension, resp]
203
 
204
  async def get_gathering_dimension(self, docsearch, incontext_prompt, retrieved_docs):
205
  dimensions = [
 
1
  import openai
2
  import gradio as gr
3
+ #from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain_openai import OpenAIEmbeddings
5
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
6
  from langchain.vectorstores.faiss import FAISS
7
  from langchain.chains.question_answering import load_qa_chain
8
  from langchain.chains import LLMChain
9
+ from langchain_community.llms import OpenAI
10
+ #from langchain import PromptTemplate
11
+ from langchain.prompts import PromptTemplate
12
  from langchain.docstore.document import Document
13
  import pandas as pd
14
  import os
 
67
 
68
  # Extract text from PDF file using SCIPDF and Gorbid service (you need gorbid to use it)
69
  def extract_text_from_pdf(self, file_path):
70
+
71
+ article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
72
+ print("PDF parsed")
 
 
73
  finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
74
  for section in article_dict['sections']:
75
  sec = section['heading'] + ": "
 
95
  #table_texts.append(query + " "+ result['text'])
96
  table_texts = await asyncio.gather(*table_texts)
97
  for table in table_texts:
98
+ docsearch.add_texts(table)
99
  return docsearch
100
 
101
  def extract_text_clean(self, file_name, file_path):
 
111
  async def prepare_data(self, file_name, file_path, chain_table, apikey):
112
  # Process text and get the embeddings
113
  vectorspath = "./vectors/"+file_name
114
+
115
  #apikey = openai.api_key
 
116
  embeddings = OpenAIEmbeddings(openai_api_key=apikey)
117
  if os.path.isfile(vectorspath+"/index.faiss"):
118
 
 
144
 
145
  # Save the index locally
146
  FAISS.save_local(docsearch, "./vectors/"+file_name)
147
+
148
+ try:
149
+ result = docsearch.similarity_search("trial query")
150
+ except Exception as e:
151
+ print(e)
152
+ raise gr.Error("Your OpenAI Apikey is not valid")
153
+
154
 
155
  return docsearch
156
 
157
  def build_chains(self, apikey):
158
+ LLMClient = OpenAI(model_name='gpt-3.5-turbo-instruct',openai_api_key=apikey,temperature=0)
159
+
 
 
 
 
 
160
  ## In-context prompt
161
  prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
162
  Question: {question}
 
193
 
194
  async def async_table_generate(self, docs,table,chain):
195
 
196
+ resp = await chain.ainvoke({"context": docs, "table": table})
197
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
198
  return resp
199
 
200
  async def async_generate(self, dimension, docs,question,chain):
201
+ resp = await chain.ainvoke({"input_documents": docs, "question": question})
202
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
203
+ return [dimension, resp['output_text']]
204
 
205
  async def get_gathering_dimension(self, docsearch, incontext_prompt, retrieved_docs):
206
  dimensions = [