Joan Giner committed on
Commit
f87e387
1 Parent(s): 460caa6

upgraded openai and langchain versions

Browse files
Files changed (3) hide show
  1. app.py +6 -4
  2. requirements.txt +18 -7
  3. src/extractor.py +22 -21
app.py CHANGED
@@ -1,12 +1,12 @@
1
  import openai
2
  import gradio as gr
3
- from langchain.embeddings import OpenAIEmbeddings
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
  from langchain.vectorstores.faiss import FAISS
6
  from langchain.chains.question_answering import load_qa_chain
7
  from langchain.chains import LLMChain
8
- from langchain.llms import OpenAI
9
- from langchain import PromptTemplate
10
  from langchain.docstore.document import Document
11
  import pandas as pd
12
  import os
@@ -24,7 +24,7 @@ load_dotenv()
24
  #openai.api_key=os.getenv("OPEN_AI_API_KEY")
25
  #LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
26
  extractor = Extractor()
27
-
28
  # Define function to handle the Gradio interface
29
  async def extraction(input_file, apikey, dimension):
30
  # Build the chains
@@ -55,6 +55,8 @@ async def ui_extraction(input_file, apikey, dimension):
55
  raise gr.Error("Please upload a data paper")
56
  if (input_file.name.split(".")[-1] != "pdf"):
57
  raise gr.Error("This is not a data paper!, please upload it in .pdf format")
 
 
58
  file_name = input_file.name.split("/")[-1]
59
  results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
60
  # Build results in the correct format for the Gradio front-end
 
1
  import openai
2
  import gradio as gr
3
+ from langchain_openai import OpenAIEmbeddings
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
  from langchain.vectorstores.faiss import FAISS
6
  from langchain.chains.question_answering import load_qa_chain
7
  from langchain.chains import LLMChain
8
+ from langchain_community.llms import OpenAI
9
+ from langchain.prompts import PromptTemplate
10
  from langchain.docstore.document import Document
11
  import pandas as pd
12
  import os
 
24
  #openai.api_key=os.getenv("OPEN_AI_API_KEY")
25
  #LLMClient = OpenAI(model_name='text-davinci-003', openai_api_key=openai.api_key,temperature=0)
26
  extractor = Extractor()
27
+ print(os.getenv("OPEN_AI_API_KEY"))
28
  # Define function to handle the Gradio interface
29
  async def extraction(input_file, apikey, dimension):
30
  # Build the chains
 
55
  raise gr.Error("Please upload a data paper")
56
  if (input_file.name.split(".")[-1] != "pdf"):
57
  raise gr.Error("This is not a data paper!, please upload it in .pdf format")
58
+ if (len(apikey) == 0):
59
+ raise gr.Error("Please inform your OpenAI Apikey")
60
  file_name = input_file.name.split("/")[-1]
61
  results, completeness_report = await extractor.extraction(file_name, input_file.name, apikey, dimension)
62
  # Build results in the correct format for the Gradio front-end
requirements.txt CHANGED
@@ -33,14 +33,22 @@ gradio==3.32.0
33
  gradio_client==0.2.5
34
  h11==0.14.0
35
  httpcore==0.17.2
 
36
  httpx==0.24.1
37
  huggingface-hub==0.14.1
38
  idna==3.4
 
39
  Jinja2==3.1.2
 
 
40
  jsonschema==4.17.3
41
  kiwisolver==1.4.4
42
- langchain==0.0.186
 
 
 
43
  langcodes==3.3.0
 
44
  linkify-it-py==2.0.2
45
  lxml==4.9.2
46
  markdown-it-py==2.2.0
@@ -60,10 +68,10 @@ necessary==0.4.2
60
  networkx==3.1
61
  numexpr==2.8.4
62
  numpy==1.24.3
63
- openai==0.27.7
64
  openapi-schema-pydantic==1.2.4
65
  orjson==3.8.14
66
- packaging==23.1
67
  pandas==1.5.3
68
  pathy==0.10.1
69
  pdf2image==1.16.3
@@ -86,7 +94,7 @@ PyYAML==6.0
86
  regex==2023.5.5
87
  requests==2.31.0
88
  requirements-parser==0.5.0
89
- scipdf @ git+https://github.com/titipata/scipdf_parser@master
90
  semantic-version==2.10.0
91
  six==1.16.0
92
  smart-open==6.3.0
@@ -103,7 +111,7 @@ tabula-py==2.7.0
103
  tenacity==8.2.2
104
  textstat==0.7.3
105
  thinc==8.1.10
106
- tiktoken==0.4.0
107
  tokenizers==0.13.3
108
  toolz==0.12.0
109
  torch==2.0.1
@@ -112,11 +120,14 @@ transformers==4.29.2
112
  typer==0.7.0
113
  types-setuptools==67.8.0.0
114
  typing-inspect==0.9.0
115
- typing_extensions==4.6.2
116
  uc-micro-py==1.0.2
117
- urllib3==2.0.2
118
  uvicorn==0.22.0
 
119
  Wand==0.6.11
120
  wasabi==1.1.1
 
121
  websockets==11.0.3
122
  yarl==1.9.2
 
 
33
  gradio_client==0.2.5
34
  h11==0.14.0
35
  httpcore==0.17.2
36
+ httptools==0.5.0
37
  httpx==0.24.1
38
  huggingface-hub==0.14.1
39
  idna==3.4
40
+ importlib-resources==6.1.1
41
  Jinja2==3.1.2
42
+ jsonpatch==1.33
43
+ jsonpointer==2.4
44
  jsonschema==4.17.3
45
  kiwisolver==1.4.4
46
+ langchain==0.1.2
47
+ langchain-community==0.0.14
48
+ langchain-core==0.1.14
49
+ langchain-openai==0.0.3
50
  langcodes==3.3.0
51
+ langsmith==0.0.83
52
  linkify-it-py==2.0.2
53
  lxml==4.9.2
54
  markdown-it-py==2.2.0
 
68
  networkx==3.1
69
  numexpr==2.8.4
70
  numpy==1.24.3
71
+ openai==1.9.0
72
  openapi-schema-pydantic==1.2.4
73
  orjson==3.8.14
74
+ packaging==23.2
75
  pandas==1.5.3
76
  pathy==0.10.1
77
  pdf2image==1.16.3
 
94
  regex==2023.5.5
95
  requests==2.31.0
96
  requirements-parser==0.5.0
97
+ scipdf==0.1.dev0
98
  semantic-version==2.10.0
99
  six==1.16.0
100
  smart-open==6.3.0
 
111
  tenacity==8.2.2
112
  textstat==0.7.3
113
  thinc==8.1.10
114
+ tiktoken==0.5.2
115
  tokenizers==0.13.3
116
  toolz==0.12.0
117
  torch==2.0.1
 
120
  typer==0.7.0
121
  types-setuptools==67.8.0.0
122
  typing-inspect==0.9.0
123
+ typing_extensions==4.9.0
124
  uc-micro-py==1.0.2
125
+ urllib3==1.26.6
126
  uvicorn==0.22.0
127
+ uvloop==0.17.0
128
  Wand==0.6.11
129
  wasabi==1.1.1
130
+ watchfiles==0.19.0
131
  websockets==11.0.3
132
  yarl==1.9.2
133
+ zipp==3.17.0
src/extractor.py CHANGED
@@ -1,12 +1,14 @@
1
  import openai
2
  import gradio as gr
3
- from langchain.embeddings import OpenAIEmbeddings
 
4
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
5
  from langchain.vectorstores.faiss import FAISS
6
  from langchain.chains.question_answering import load_qa_chain
7
  from langchain.chains import LLMChain
8
- from langchain.llms import OpenAI
9
- from langchain import PromptTemplate
 
10
  from langchain.docstore.document import Document
11
  import pandas as pd
12
  import os
@@ -65,11 +67,9 @@ class Extractor:
65
 
66
  # Extract text from PDF file using SCIPDF and Gorbid service (you need gorbid to use it)
67
  def extract_text_from_pdf(self, file_path):
68
- try:
69
- article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
70
- print("PDF parsed")
71
- except:
72
- raise gr.Error("Error parsing PDF, please update your data paper in the correct format")
73
  finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
74
  for section in article_dict['sections']:
75
  sec = section['heading'] + ": "
@@ -95,7 +95,7 @@ class Extractor:
95
  #table_texts.append(query + " "+ result['text'])
96
  table_texts = await asyncio.gather(*table_texts)
97
  for table in table_texts:
98
- docsearch.add_texts(table[1])
99
  return docsearch
100
 
101
  def extract_text_clean(self, file_name, file_path):
@@ -111,9 +111,8 @@ class Extractor:
111
  async def prepare_data(self, file_name, file_path, chain_table, apikey):
112
  # Process text and get the embeddings
113
  vectorspath = "./vectors/"+file_name
114
- if not apikey:
115
  #apikey = openai.api_key
116
- raise gr.Error("Please set your api key")
117
  embeddings = OpenAIEmbeddings(openai_api_key=apikey)
118
  if os.path.isfile(vectorspath+"/index.faiss"):
119
 
@@ -145,17 +144,19 @@ class Extractor:
145
 
146
  # Save the index locally
147
  FAISS.save_local(docsearch, "./vectors/"+file_name)
 
 
 
 
 
 
 
148
 
149
  return docsearch
150
 
151
  def build_chains(self, apikey):
152
- if not apikey:
153
- #apikey = openai.api_key
154
- raise gr.Error("Please set your Api key")
155
- try:
156
- LLMClient = OpenAI(model_name='text-davinci-003',openai_api_key=apikey,temperature=0)
157
- except:
158
- raise gr.Error("Your Api key is not valid")
159
  ## In-context prompt
160
  prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
161
  Question: {question}
@@ -192,14 +193,14 @@ class Extractor:
192
 
193
  async def async_table_generate(self, docs,table,chain):
194
 
195
- resp = await chain.arun({"context": docs, "table": table})
196
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
197
  return resp
198
 
199
  async def async_generate(self, dimension, docs,question,chain):
200
- resp = await chain.arun({"input_documents": docs, "question": question})
201
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
202
- return [dimension, resp]
203
 
204
  async def get_gathering_dimension(self, docsearch, incontext_prompt, retrieved_docs):
205
  dimensions = [
 
1
  import openai
2
  import gradio as gr
3
+ #from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain_openai import OpenAIEmbeddings
5
  from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
6
  from langchain.vectorstores.faiss import FAISS
7
  from langchain.chains.question_answering import load_qa_chain
8
  from langchain.chains import LLMChain
9
+ from langchain_community.llms import OpenAI
10
+ #from langchain import PromptTemplate
11
+ from langchain.prompts import PromptTemplate
12
  from langchain.docstore.document import Document
13
  import pandas as pd
14
  import os
 
67
 
68
  # Extract text from PDF file using SCIPDF and Gorbid service (you need gorbid to use it)
69
  def extract_text_from_pdf(self, file_path):
70
+
71
+ article_dict = scipdf.parse_pdf_to_dict(file_path, soup=True,return_coordinates=False, grobid_url="https://kermitt2-grobid.hf.space") # return dictionary
72
+ print("PDF parsed")
 
 
73
  finaltext = article_dict['title'] + " \n\n " + article_dict['authors'] + " \n\n Abstract: " + article_dict['abstract'] + " \n\n "
74
  for section in article_dict['sections']:
75
  sec = section['heading'] + ": "
 
95
  #table_texts.append(query + " "+ result['text'])
96
  table_texts = await asyncio.gather(*table_texts)
97
  for table in table_texts:
98
+ docsearch.add_texts(table)
99
  return docsearch
100
 
101
  def extract_text_clean(self, file_name, file_path):
 
111
  async def prepare_data(self, file_name, file_path, chain_table, apikey):
112
  # Process text and get the embeddings
113
  vectorspath = "./vectors/"+file_name
114
+
115
  #apikey = openai.api_key
 
116
  embeddings = OpenAIEmbeddings(openai_api_key=apikey)
117
  if os.path.isfile(vectorspath+"/index.faiss"):
118
 
 
144
 
145
  # Save the index locally
146
  FAISS.save_local(docsearch, "./vectors/"+file_name)
147
+
148
+ try:
149
+ result = docsearch.similarity_search("trial query")
150
+ except Exception as e:
151
+ print(e)
152
+ raise gr.Error("Your OpenAI Apikey is not valid")
153
+
154
 
155
  return docsearch
156
 
157
  def build_chains(self, apikey):
158
+ LLMClient = OpenAI(model_name='gpt-3.5-turbo-instruct',openai_api_key=apikey,temperature=0)
159
+
 
 
 
 
 
160
  ## In-context prompt
161
  prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
162
  Question: {question}
 
193
 
194
  async def async_table_generate(self, docs,table,chain):
195
 
196
+ resp = await chain.ainvoke({"context": docs, "table": table})
197
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
198
  return resp
199
 
200
  async def async_generate(self, dimension, docs,question,chain):
201
+ resp = await chain.ainvoke({"input_documents": docs, "question": question})
202
  #resp = "Description of the team, the type, and the demographics information, Description of the team, the type, and the demographics information"
203
+ return [dimension, resp['output_text']]
204
 
205
  async def get_gathering_dimension(self, docsearch, incontext_prompt, retrieved_docs):
206
  dimensions = [