cheesyFishes committed on
Commit b0bfa1a • 1 Parent(s): c755c9e

initial commit

Files changed (8)
  1. README.md +4 -4
  2. app.py +208 -0
  3. constants.py +105 -0
  4. index.json +0 -0
  5. nyc_wiki.txt +0 -0
  6. requirements.txt +4 -0
  7. terms_definitions_tutorial.md +494 -0
  8. utils.py +15 -0
README.md CHANGED
@@ -1,12 +1,12 @@
  ---
  title: Llama Index Term Definition Demo
- emoji: 😻
+ emoji: 🦙📚
- colorFrom: pink
+ colorFrom: blue
  colorTo: purple
  sdk: streamlit
- sdk_version: 1.17.0
+ sdk_version: 1.19.0
  app_file: app.py
- pinned: false
+ pinned: true
  license: mit
  ---

app.py ADDED
@@ -0,0 +1,208 @@
+ import os
+ import streamlit as st
+
+ from PIL import Image
+ from llama_index import (
+     Document,
+     GPTSimpleVectorIndex,
+     GPTListIndex,
+     LLMPredictor,
+     ServiceContext,
+     SimpleDirectoryReader,
+     PromptHelper,
+ )
+ from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR, ImageParser
+
+ from constants import DEFAULT_TERM_STR, DEFAULT_TERMS, REFINE_TEMPLATE, TEXT_QA_TEMPLATE
+ from utils import get_llm
+
+
+ # track every term/definition inserted so far, per user session
+ if "all_terms" not in st.session_state:
+     st.session_state["all_terms"] = DEFAULT_TERMS
+
+
+ @st.cache_resource
+ def get_file_extractor():
+     # route image files to the ImageParser so their text can be extracted
+     image_parser = ImageParser(keep_image=True, parse_text=True)
+     file_extractor = DEFAULT_FILE_EXTRACTOR
+     file_extractor.update(
+         {
+             ".jpg": image_parser,
+             ".png": image_parser,
+             ".jpeg": image_parser,
+         }
+     )
+
+     return file_extractor
+
+
+ file_extractor = get_file_extractor()
+
+
+ def extract_terms(documents, term_extract_str, llm_name, model_temperature, api_key):
+     llm = get_llm(llm_name, model_temperature, api_key, max_tokens=1024)
+
+     service_context = ServiceContext.from_defaults(
+         llm_predictor=LLMPredictor(llm=llm),
+         prompt_helper=PromptHelper(
+             max_input_size=4096, max_chunk_overlap=20, num_output=1024
+         ),
+         chunk_size_limit=1024,
+     )
+
+     temp_index = GPTListIndex.from_documents(documents, service_context=service_context)
+     terms_definitions = str(
+         temp_index.query(term_extract_str, response_mode="tree_summarize")
+     )
+     terms_definitions = [
+         x
+         for x in terms_definitions.split("\n")
+         if x and "Term:" in x and "Definition:" in x
+     ]
+     # parse the text into a dict
+     terms_to_definition = {
+         x.split("Definition:")[0]
+         .split("Term:")[-1]
+         .strip(): x.split("Definition:")[-1]
+         .strip()
+         for x in terms_definitions
+     }
+     return terms_to_definition
+
+
+ def insert_terms(terms_to_definition):
+     for term, definition in terms_to_definition.items():
+         doc = Document(f"Term: {term}\nDefinition: {definition}")
+         st.session_state["llama_index"].insert(doc)
+
+
+ @st.cache_resource
+ def initialize_index(llm_name, model_temperature, api_key):
+     """Load the GPTSimpleVectorIndex from disk."""
+     llm = get_llm(llm_name, model_temperature, api_key)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
+
+     index = GPTSimpleVectorIndex.load_from_disk(
+         "./index.json", service_context=service_context
+     )
+
+     return index
+
+
+ st.title("🦙 Llama Index Term Extractor 🦙")
+ st.markdown(
+     (
+         "This demo allows you to upload your own documents (either a screenshot/image or the actual text) and extract terms and definitions, building a knowledge base!\n\n"
+         "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html) and OpenAI, you can augment the existing knowledge of an "
+         "LLM using your own notes, documents, and images. Then, when you ask about a term or definition, it will use your data first! "
+     )
+ )
+
+ setup_tab, terms_tab, upload_tab, query_tab = st.tabs(
+     ["Setup", "All Terms", "Upload/Extract Terms", "Query Terms"]
+ )
+
+ with setup_tab:
+     st.subheader("LLM Setup")
+     api_key = st.text_input("Enter your OpenAI API key here", type="password")
+     llm_name = st.selectbox(
+         "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
+     )
+     model_temperature = st.slider(
+         "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
+     )
+     term_extract_str = st.text_area(
+         "The query to extract terms and definitions with.", value=DEFAULT_TERM_STR
+     )
+
+
+ with terms_tab:
+     st.subheader("Current Extracted Terms and Definitions")
+     st.json(st.session_state["all_terms"])
+
+
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     if st.button("Initialize Index and Reset Terms", key="init_index_1"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = DEFAULT_TERMS
+
+     if "llama_index" in st.session_state:
+         st.markdown(
+             "Either upload an image/screenshot of a document, or enter the text manually."
+         )
+         uploaded_file = st.file_uploader(
+             "Upload an image/screenshot of a document:", type=["png", "jpg", "jpeg"]
+         )
+         document_text = st.text_area("Or enter raw text")
+         if st.button("Extract Terms and Definitions") and (
+             uploaded_file or document_text
+         ):
+             st.session_state["terms"] = {}
+             terms_docs = {}
+             with st.spinner("Extracting (images may be slow)..."):
+                 if document_text:
+                     terms_docs.update(
+                         extract_terms(
+                             [Document(document_text)],
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+                 if uploaded_file:
+                     # save the upload to a temp file so the reader can parse it
+                     Image.open(uploaded_file).convert("RGB").save("temp.png")
+                     img_reader = SimpleDirectoryReader(
+                         input_files=["temp.png"], file_extractor=file_extractor
+                     )
+                     img_docs = img_reader.load_data()
+                     os.remove("temp.png")
+                     terms_docs.update(
+                         extract_terms(
+                             img_docs,
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+             st.session_state["terms"].update(terms_docs)
+
+     if "terms" in st.session_state and st.session_state["terms"]:
+         st.markdown("Extracted terms")
+         st.json(st.session_state["terms"])
+
+         if st.button("Insert terms?"):
+             with st.spinner("Inserting terms"):
+                 insert_terms(st.session_state["terms"])
+             st.session_state["all_terms"].update(st.session_state["terms"])
+             st.session_state["terms"] = {}
+             st.experimental_rerun()
+
+ with query_tab:
+     st.subheader("Query for Terms/Definitions!")
+     st.markdown(
+         (
+             "The LLM will attempt to answer your query, and augment its answers using the terms/definitions you've inserted. "
+             "If a term is not in the index, it will answer using its internal knowledge."
+         )
+     )
+     if st.button("Initialize Index and Reset Terms", key="init_index_2"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = DEFAULT_TERMS
+
+     if "llama_index" in st.session_state:
+         query_text = st.text_input("Ask about a term or definition:")
+         if query_text:
+             with st.spinner("Generating answer..."):
+                 response = st.session_state["llama_index"].query(
+                     query_text, similarity_top_k=5, response_mode="compact",
+                     text_qa_template=TEXT_QA_TEMPLATE, refine_template=REFINE_TEMPLATE,
+                 )
+             st.markdown(str(response))
constants.py ADDED
@@ -0,0 +1,105 @@
+ from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model
+ from langchain.prompts.chat import (
+     AIMessagePromptTemplate,
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+
+ from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
+
+ # Text QA templates
+ DEFAULT_TEXT_QA_PROMPT_TMPL = (
+     "Context information is below. \n"
+     "---------------------\n"
+     "{context_str}"
+     "\n---------------------\n"
+     "Given the context information answer the following question "
+     "(if you don't know the answer, use the best of your knowledge): {query_str}\n"
+ )
+ TEXT_QA_TEMPLATE = QuestionAnswerPrompt(DEFAULT_TEXT_QA_PROMPT_TMPL)
+
+ # Refine templates
+ DEFAULT_REFINE_PROMPT_TMPL = (
+     "The original question is as follows: {query_str}\n"
+     "We have provided an existing answer: {existing_answer}\n"
+     "We have the opportunity to refine the existing answer "
+     "(only if needed) with some more context below.\n"
+     "------------\n"
+     "{context_msg}\n"
+     "------------\n"
+     "Given the new context and using the best of your knowledge, improve the existing answer. "
+     "If you can't improve the existing answer, just repeat it again. "
+     "Do not mention that you've read the above context."
+ )
+ DEFAULT_REFINE_PROMPT = RefinePrompt(DEFAULT_REFINE_PROMPT_TMPL)
+
+ CHAT_REFINE_PROMPT_TMPL_MSGS = [
+     HumanMessagePromptTemplate.from_template("{query_str}"),
+     AIMessagePromptTemplate.from_template("{existing_answer}"),
+     HumanMessagePromptTemplate.from_template(
+         "We have the opportunity to refine the above answer "
+         "(only if needed) with some more context below.\n"
+         "------------\n"
+         "{context_msg}\n"
+         "------------\n"
+         "Given the new context and using the best of your knowledge, improve the existing answer. "
+         "If you can't improve the existing answer, just repeat it again. "
+         "Do not mention that you've read the above context."
+     ),
+ ]
+
+ CHAT_REFINE_PROMPT_LC = ChatPromptTemplate.from_messages(CHAT_REFINE_PROMPT_TMPL_MSGS)
+ CHAT_REFINE_PROMPT = RefinePrompt.from_langchain_prompt(CHAT_REFINE_PROMPT_LC)
+
+ # refine prompt selector
+ DEFAULT_REFINE_PROMPT_SEL_LC = ConditionalPromptSelector(
+     default_prompt=DEFAULT_REFINE_PROMPT.get_langchain_prompt(),
+     conditionals=[(is_chat_model, CHAT_REFINE_PROMPT.get_langchain_prompt())],
+ )
+ REFINE_TEMPLATE = RefinePrompt(
+     langchain_prompt_selector=DEFAULT_REFINE_PROMPT_SEL_LC
+ )
+
+ DEFAULT_TERM_STR = (
+     "Make a list of terms and definitions that are defined in the context, "
+     "with one pair on each line. "
+     "If a term is missing its definition, use your best judgment. "
+     "Write each line as follows:\nTerm: <term> Definition: <definition>"
+ )
+
+ DEFAULT_TERMS = {
+     "New York City": "The most populous city in the United States, located at the southern tip of New York State, and the largest metropolitan area in the U.S. by both population and urban area.",
+     "boroughs": "Five administrative divisions of New York City, each coextensive with a respective county of the state of New York: Brooklyn, Queens, Manhattan, The Bronx, and Staten Island.",
+     "metropolitan statistical area": "A geographical region with a relatively high population density at its core and close economic ties throughout the area.",
+     "combined statistical area": "A combination of adjacent metropolitan and micropolitan statistical areas in the United States and Puerto Rico that can demonstrate economic or social linkage.",
+     "megacities": "A city with a population of over 10 million people.",
+     "United Nations": "An intergovernmental organization that aims to maintain international peace and security, develop friendly relations among nations, achieve international cooperation, and be a center for harmonizing the actions of nations.",
+     "Pulitzer Prizes": "A series of annual awards for achievements in journalism, literature, and musical composition in the United States.",
+     "Times Square": "A major commercial and tourist destination in Manhattan, New York City.",
+     "New Netherland": "A Dutch colony in North America that existed from 1614 until 1664.",
+     "Dutch West India Company": "A Dutch trading company that operated as a monopoly in New Netherland from 1621 until 1639-1640.",
+     "patroon system": "A system instituted by the Dutch to attract settlers to New Netherland, whereby wealthy Dutchmen who brought 50 colonists would be awarded land and local political autonomy.",
+     "Peter Stuyvesant": "The last Director-General of New Netherland, who served from 1647 until 1664.",
+     "Treaty of Breda": "A treaty signed in 1667 between the Dutch and English that resulted in the Dutch keeping Suriname and the English keeping New Amsterdam (which was renamed New York).",
+     "African Burying Ground": "A cemetery discovered in Foley Square in the 1990s that included 10,000 to 20,000 graves of colonial-era Africans, some enslaved and some free.",
+     "Stamp Act Congress": "A meeting held in New York in 1765 in response to the Stamp Act, which imposed taxes on printed materials in the American colonies.",
+     "Battle of Long Island": "The largest battle of the American Revolutionary War, fought on August 27, 1776, in Brooklyn, New York City.",
+     "New York Police Department": "The police force of New York City.",
+     "Irish immigrants": "People who immigrated to the United States from Ireland.",
+     "lynched": "To kill someone, especially by hanging, without a legal trial.",
+     "civil unrest": "A situation in which people in a country are angry and likely to protest or fight.",
+     "megacity": "A very large city, typically one with a population of over ten million people.",
+     "World Trade Center": "A complex of buildings in Lower Manhattan, New York City, that were destroyed in the September 11 attacks.",
+     "COVID-19": "A highly infectious respiratory illness caused by the SARS-CoV-2 virus.",
+     "monkeypox outbreak": "An outbreak of a viral disease similar to smallpox, which occurred in the LGBT community in New York City in 2022.",
+     "Hudson River": "A river in the northeastern United States, flowing from the Adirondack Mountains in New York into the Atlantic Ocean.",
+     "estuary": "A partly enclosed coastal body of brackish water with one or more rivers or streams flowing into it, and with a free connection to the open sea.",
+     "East River": "A tidal strait in New York City.",
+     "Five Boroughs": "Refers to the five counties that make up New York City: Bronx, Brooklyn, Manhattan, Queens, and Staten Island.",
+     "Staten Island": "The most suburban of the five boroughs, located southwest of Manhattan and connected to it by the free Staten Island Ferry.",
+     "Todt Hill": "The highest point on the eastern seaboard south of Maine, located on Staten Island.",
+     "Manhattan": "The geographically smallest and most densely populated borough of New York City, known for its skyscrapers, Central Park, and cultural, administrative, and financial centers.",
+     "Brooklyn": "The most populous borough of New York City, located on the western tip of Long Island and known for its cultural diversity, independent art scene, and distinctive neighborhoods.",
+     "Queens": "The largest borough of New York City, located on Long Island north and east of Brooklyn, and known for its ethnic diversity, commercial and residential prominence, and hosting of the annual U.S. Open tennis tournament.",
+     "The Bronx": "The northernmost borough of New York",
+ }
index.json ADDED
The diff for this file is too large to render. See raw diff
 
nyc_wiki.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ langchain==0.0.128
+ llama-index==0.5.4
+ Pillow==9.4.0
+ streamlit==1.19.0
terms_definitions_tutorial.md ADDED
@@ -0,0 +1,494 @@
+ # Llama Index Problem Solving - Extracting Terms and Definitions
+
+ Llama Index has many use cases (semantic search, summarization, etc.) that are [well documented](https://gpt-index.readthedocs.io/en/latest/use_cases/queries.html). However, this doesn't mean we can't apply Llama Index to very specific use cases!
+
+ In this tutorial, we will go through the design process of using Llama Index to extract terms and definitions from text, while allowing users to query those terms later. Using [Streamlit](https://streamlit.io/), we can provide an easy-to-build frontend for running and testing all of this, and quickly iterate with our design.
+
+ This tutorial assumes you have the following installed:
+
+ - python3.9+
+ - llama_index
+ - streamlit
+
+ At the base level, our objective is to take text from a document, extract terms and definitions, and then provide a way for users to query that knowledge base of terms and definitions. The tutorial will go over features from both Llama Index and Streamlit, and hopefully provide some interesting solutions for common problems that come up.
+
+ The final version of this tutorial can be found [here](https://github.com/logan-markewich/llama_index_starter_pack).
+
+ ## Uploading Text
+
+ Step one is giving users a way to upload documents. Let's write some code using Streamlit to provide the interface for this! Use the following code and launch the app with `streamlit run app.py`.
+
+ ```python
+ import streamlit as st
+
+ st.title("🦙 Llama Index Term Extractor 🦙")
+
+ document_text = st.text_area("Or enter raw text")
+ if st.button("Extract Terms and Definitions") and document_text:
+     with st.spinner("Extracting..."):
+         extracted_terms = document_text  # this is a placeholder!
+     st.write(extracted_terms)
+ ```
+
+ Super simple, right? But you'll notice that the app doesn't do anything useful yet. To use llama_index, we also need to set up our OpenAI LLM. There are a bunch of possible settings for the LLM, so we can let the user figure out what's best. We should also let the user set the prompt that will extract the terms (which will also help us debug what works best).
+
+ ## LLM Settings
+
+ This next step introduces some tabs to our app, to separate it into different panes that provide different features. Let's create a tab for LLM settings and for uploading text:
+
+ ```python
+ import os
+ import streamlit as st
+
+ DEFAULT_TERM_STR = (
+     "Make a list of terms and definitions that are defined in the context, "
+     "with one pair on each line. "
+     "If a term is missing its definition, use your best judgment. "
+     "Write each line as follows:\nTerm: <term> Definition: <definition>"
+ )
+
+ st.title("🦙 Llama Index Term Extractor 🦙")
+
+ setup_tab, upload_tab = st.tabs(["Setup", "Upload/Extract Terms"])
+
+ with setup_tab:
+     st.subheader("LLM Setup")
+     api_key = st.text_input("Enter your OpenAI API key here", type="password")
+     llm_name = st.selectbox('Which LLM?', ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"])
+     model_temperature = st.slider("LLM Temperature", min_value=0.0, max_value=1.0, step=0.1)
+     term_extract_str = st.text_area("The query to extract terms and definitions with.", value=DEFAULT_TERM_STR)
+
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     document_text = st.text_area("Or enter raw text")
+     if st.button("Extract Terms and Definitions") and document_text:
+         with st.spinner("Extracting..."):
+             extracted_terms = document_text  # this is a placeholder!
+         st.write(extracted_terms)
+ ```
+
+ Now our app has two tabs, which really helps with the organization. You'll also notice I added a default prompt to extract terms -- you can change this later once you try extracting some terms; it's just the prompt I arrived at after experimenting a bit.
+
+ Speaking of extracting terms, it's time to add some functions to do just that!
+
+ ## Extracting and Storing Terms
+
+ Now that we are able to define LLM settings and upload text, we can try using Llama Index to extract the terms from text for us!
+
+ We can add the following functions to both initialize our LLM, as well as use it to extract terms from the input text.
+
+ ```python
+ from langchain import OpenAI
+ from langchain.chat_models import ChatOpenAI
+ from llama_index import Document, GPTListIndex, LLMPredictor, ServiceContext, PromptHelper
+
+ def get_llm(llm_name, model_temperature, api_key, max_tokens=256):
+     os.environ['OPENAI_API_KEY'] = api_key
+     if llm_name == "text-davinci-003":
+         return OpenAI(temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens)
+     else:
+         return ChatOpenAI(temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens)
+
+ def extract_terms(documents, term_extract_str, llm_name, model_temperature, api_key):
+     llm = get_llm(llm_name, model_temperature, api_key, max_tokens=1024)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm),
+                                                    prompt_helper=PromptHelper(max_input_size=4096,
+                                                                               max_chunk_overlap=20,
+                                                                               num_output=1024),
+                                                    chunk_size_limit=1024)
+
+     temp_index = GPTListIndex.from_documents(documents, service_context=service_context)
+     terms_definitions = str(temp_index.query(term_extract_str, response_mode="tree_summarize"))
+     terms_definitions = [x for x in terms_definitions.split("\n") if x and 'Term:' in x and 'Definition:' in x]
+     # parse the text into a dict
+     terms_to_definition = {x.split("Definition:")[0].split("Term:")[-1].strip(): x.split("Definition:")[-1].strip() for x in terms_definitions}
+     return terms_to_definition
+ ```
+
+ Now, using the new functions, we can finally extract our terms!
+
+ ```python
+ ...
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     document_text = st.text_area("Or enter raw text")
+     if st.button("Extract Terms and Definitions") and document_text:
+         with st.spinner("Extracting..."):
+             extracted_terms = extract_terms([Document(document_text)],
+                                             term_extract_str, llm_name,
+                                             model_temperature, api_key)
+         st.write(extracted_terms)
+ ```
+
+ There's a lot going on now, so let's take a moment to go over what is happening.
+
+ `get_llm()` is instantiating the LLM based on the user configuration from the setup tab. Based on the model name, we need to use the appropriate class (`OpenAI` vs. `ChatOpenAI`).
+
+ `extract_terms()` is where all the good stuff happens. First, we call `get_llm()` with `max_tokens=1024`, since we don't want to limit the model too much when it is extracting our terms and definitions (the default is 256 if not set). Then, we define our `ServiceContext` object, aligning `num_output` with our `max_tokens` value, as well as setting the chunk size to be no larger than the output. When documents are indexed by Llama Index, they are broken into chunks (also called nodes) if they are large, and `chunk_size_limit` sets the maximum size for these chunks.
+
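+ As rough arithmetic (our reading of why these settings line up, not code from the app): the context window has to hold both the prompt and the generated answer, so reserving `num_output` tokens leaves the rest for context.
+
+ ```python
+ max_input_size = 4096  # the model's total context window
+ num_output = 1024      # tokens reserved for the generated answer
+
+ # roughly what's left for the prompt template + context chunks per call
+ available_for_context = max_input_size - num_output
+ print(available_for_context)  # 3072 -- so 1024-token chunks fit comfortably
+ ```
+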
+ Next, we create a temporary list index and pass in our service context. A list index will read every single piece of text in our index, which is perfect for extracting terms. Then, we use our pre-defined query text to extract terms, using `response_mode="tree_summarize"`. This response mode will generate a tree of summaries from the bottom up, where each parent summarizes its children. The top of the tree is then returned, and it will contain all our extracted terms and definitions.
+
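+ Conceptually, tree summarization works something like the sketch below (a simplified illustration of the idea, not Llama Index's actual implementation; `summarize` stands in for a real LLM call):
+
+ ```python
+ def tree_summarize(chunks, query, summarize):
+     # summarize(query, texts) -> str is one LLM call over the given texts
+     while len(chunks) > 1:
+         # combine neighboring chunks pairwise, building the tree bottom-up
+         chunks = [summarize(query, chunks[i:i + 2]) for i in range(0, len(chunks), 2)]
+     return chunks[0]  # the root summary is the final answer
+ ```
+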
+ Lastly, we do some minor post-processing. We assume the model followed instructions and put a term/definition pair on each line. If a line is missing the `Term:` or `Definition:` labels, we skip it. Then, we convert this to a dictionary for easy storage!
+
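+ As a quick illustration of that parsing (using made-up response text):
+
+ ```python
+ sample_response = (
+     "Term: bunnyhug Definition: Another word for a hoodie.\n"
+     "Some stray line without labels\n"
+     "Term: estuary Definition: A partly enclosed coastal body of brackish water."
+ )
+
+ lines = [x for x in sample_response.split("\n") if x and "Term:" in x and "Definition:" in x]
+ terms_to_definition = {
+     x.split("Definition:")[0].split("Term:")[-1].strip(): x.split("Definition:")[-1].strip()
+     for x in lines
+ }
+ print(terms_to_definition)
+ # {'bunnyhug': 'Another word for a hoodie.',
+ #  'estuary': 'A partly enclosed coastal body of brackish water.'}
+ ```
+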
+ ## Saving Extracted Terms
+
+ Now that we can extract terms, we need to put them somewhere so that we can query them later. A `GPTSimpleVectorIndex` should be a perfect choice for now! In addition, our app should keep track of which terms have been inserted into the index, so that we can inspect them later. Using `st.session_state`, we can store the current list of terms in a session dict, unique to each user!
+
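+ If you haven't used it before, `st.session_state` is a dictionary that survives Streamlit's script reruns (every widget interaction reruns the whole script). A tiny standalone example of the behavior (not part of our app):
+
+ ```python
+ import streamlit as st
+
+ # without session state, this counter would reset to 0 on every rerun
+ if "count" not in st.session_state:
+     st.session_state["count"] = 0
+
+ if st.button("Increment"):
+     st.session_state["count"] += 1
+
+ st.write(f"Button clicked {st.session_state['count']} times")
+ ```
+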
+ First things first though, let's add a feature to initialize a global vector index and another function to insert the extracted terms.
+
+ ```python
+ ...
+ if 'all_terms' not in st.session_state:
+     st.session_state['all_terms'] = {}
+ ...
+
+ def insert_terms(terms_to_definition):
+     for term, definition in terms_to_definition.items():
+         doc = Document(f"Term: {term}\nDefinition: {definition}")
+         st.session_state['llama_index'].insert(doc)
+
+ @st.cache_resource
+ def initialize_index(llm_name, model_temperature, api_key):
+     """Create the GPTSimpleVectorIndex object."""
+     llm = get_llm(llm_name, model_temperature, api_key)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
+
+     index = GPTSimpleVectorIndex([], service_context=service_context)
+
+     return index
+
+ ...
+
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     if st.button("Initialize Index and Reset Terms"):
+         st.session_state['llama_index'] = initialize_index(llm_name, model_temperature, api_key)
+         st.session_state['all_terms'] = {}
+
+     if "llama_index" in st.session_state:
+         st.markdown("Either upload an image/screenshot of a document, or enter the text manually.")
+         document_text = st.text_area("Or enter raw text")
+         if st.button("Extract Terms and Definitions") and document_text:
+             st.session_state['terms'] = {}
+             terms_docs = {}
+             with st.spinner("Extracting..."):
+                 terms_docs.update(extract_terms([Document(document_text)], term_extract_str, llm_name, model_temperature, api_key))
+             st.session_state['terms'].update(terms_docs)
+
+     if "terms" in st.session_state and st.session_state["terms"]:
+         st.markdown("Extracted terms")
+         st.json(st.session_state['terms'])
+
+         if st.button("Insert terms?"):
+             with st.spinner("Inserting terms"):
+                 insert_terms(st.session_state['terms'])
+             st.session_state['all_terms'].update(st.session_state['terms'])
+             st.session_state['terms'] = {}
+             st.experimental_rerun()
+ ```
+
+ Now you are really starting to leverage the power of Streamlit! Let's start with the code under the upload tab. We added a button to initialize the vector index, and we store it in the global Streamlit state dictionary, as well as resetting the currently extracted terms. Then, after extracting terms from the input text, we store the extracted terms in the global state again and give the user a chance to review them before inserting. If the insert button is pressed, then we call our insert terms function, update our global tracking of inserted terms, and remove the most recently extracted terms from the session state.
+
+ ## Querying for Extracted Terms/Definitions
+
+ With the terms and definitions extracted and saved, how can we use them? And how will the user even remember what's previously been saved? We can simply add some more tabs to the app to handle these features.
+
+ ```python
+ ...
+ setup_tab, terms_tab, upload_tab, query_tab = st.tabs(
+     ["Setup", "All Terms", "Upload/Extract Terms", "Query Terms"]
+ )
+ ...
+ with terms_tab:
+     st.subheader("Current Extracted Terms and Definitions")
+     st.json(st.session_state["all_terms"])
+ ...
+ with query_tab:
+     st.subheader("Query for Terms/Definitions!")
+     st.markdown(
+         (
+             "The LLM will attempt to answer your query, and augment its answers using the terms/definitions you've inserted. "
+             "If a term is not in the index, it will answer using its internal knowledge."
+         )
+     )
+     if st.button("Initialize Index and Reset Terms", key="init_index_2"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = {}
+
+     if "llama_index" in st.session_state:
+         query_text = st.text_input("Ask about a term or definition:")
+         if query_text:
+             query_text = query_text + "\nIf you can't find the answer, answer the query with the best of your knowledge."
+             with st.spinner("Generating answer..."):
+                 response = st.session_state["llama_index"].query(
+                     query_text, similarity_top_k=5, response_mode="compact"
+                 )
+             st.markdown(str(response))
+ ```
+
+ While this is mostly basic, some important things to note:
+
+ - Our initialize button has the same text as our other button. Streamlit will complain about this, so we provide a unique key instead.
+ - Some additional text has been added to the query! This is to try and compensate for times when the index does not have the answer.
+ - In our index query, we've specified two options:
+   - `similarity_top_k=5` means the index will fetch the top 5 closest matching terms/definitions to the query.
+   - `response_mode="compact"` means as much text as possible from the 5 matching terms/definitions will be used in each LLM call. Without this, the index would make at least 5 calls to the LLM, which can slow things down for the user (see the rough comparison below).
+
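+ To make that difference concrete, here's some back-of-the-envelope arithmetic (the sizes are made up for illustration, not Llama Index internals):
+
+ ```python
+ import math
+
+ num_chunks = 5          # matching terms/definitions retrieved from the index
+ tokens_per_chunk = 800  # assumed size of each retrieved chunk
+ context_budget = 3000   # assumed room for context in each LLM call
+
+ refine_calls = num_chunks  # default: one LLM call per retrieved chunk
+ compact_calls = math.ceil(num_chunks * tokens_per_chunk / context_budget)
+
+ print(refine_calls, compact_calls)  # 5 vs. 2 -- "compact" packs chunks together
+ ```
+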
+ ## Dry Run Test
+
+ Well, actually I hope you've been testing as we went. But now, let's try one complete test.
+
+ 1. Refresh the app
+ 2. Enter your LLM settings
+ 3. Head over to the query tab
+ 4. Ask the following: `What is a bunnyhug?`
+ 5. The app should give some nonsense response. If you didn't know, a bunnyhug is another word for a hoodie, used by people from the Canadian Prairies!
+ 6. Let's add this definition to the app. Open the upload tab and enter the following text: `A bunnyhug is a common term used to describe a hoodie. This term is used by people from the Canadian Prairies.`
+ 7. Click the extract button. After a few moments, the app should display the correctly extracted term/definition. Click the insert term button to save it!
+ 8. If we open the terms tab, the term and definition we just extracted should be displayed
+ 9. Go back to the query tab and try asking what a bunnyhug is. Now, the answer should be correct!
+
+ ## Improvement #1 - Create a Starting Index
+
+ With our base app working, it might feel like a lot of work to build up a useful index. What if we gave the user some kind of starting point to show off the app's query capabilities? We can do just that! First, let's make a small change to our app so that we save the index to disk after every upload:
+
+ ```python
+ def insert_terms(terms_to_definition):
+     for term, definition in terms_to_definition.items():
+         doc = Document(f"Term: {term}\nDefinition: {definition}")
+         st.session_state['llama_index'].insert(doc)
+     # TEMPORARY - save to disk
+     st.session_state['llama_index'].save_to_disk("index.json")
+ ```
+
+ Now, we need some document to extract from! The repository for this project used the Wikipedia page on New York City, and you can find the text [here](https://github.com/jerryjliu/llama_index/blob/main/examples/test_wiki/data/nyc_text.txt).
+
+ If you paste the text into the upload tab and run it (it may take some time), we can insert the extracted terms. Make sure to also copy the text for the extracted terms into a notepad or similar before inserting them into the index! We will need them in a second.
+
+ After inserting, remove the line of code we used to save the index to disk. With a starting index now saved, we can modify our `initialize_index` function to look like this:
+
+ ```python
+ @st.cache_resource
+ def initialize_index(llm_name, model_temperature, api_key):
+     """Load the GPTSimpleVectorIndex from disk."""
+     llm = get_llm(llm_name, model_temperature, api_key)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
+
+     index = GPTSimpleVectorIndex.load_from_disk(
+         "./index.json", service_context=service_context
+     )
+
+     return index
+ ```
+
+ Did you remember to save that giant list of extracted terms in a notepad? Now when our app initializes, we want to pass in the default terms that are in the index to our global terms state:
+
+ ```python
+ ...
+ if "all_terms" not in st.session_state:
+     st.session_state["all_terms"] = DEFAULT_TERMS
+ ...
+ ```
+
+ Repeat the above anywhere where we were previously resetting the `all_terms` values.
+
+ ## Improvement #2 - (Refining) Better Prompts
+
+ If you play around with the app a bit now, you might notice that it stopped following our prompt! Remember, we added to our `query_str` variable that if the term/definition could not be found, the model should answer to the best of its knowledge. But now if you try asking about random terms (like bunnyhug!), it may or may not follow those instructions.
+
+ This is due to the concept of "refining" answers in Llama Index. Since we are querying across the top 5 matching results, sometimes all the results do not fit in a single prompt! OpenAI models typically have a max input size of 4097 tokens. So, Llama Index accounts for this by breaking up the matching results into chunks that will fit into the prompt. After Llama Index gets an initial answer from the first API call, it sends the next chunk to the API, along with the previous answer, and asks the model to refine that answer.
+
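+ In pseudocode, the refine process looks roughly like this (a simplified sketch of the idea, not the actual Llama Index code; `llm` stands in for a real API call):
+
+ ```python
+ def answer_with_refine(query, chunks, llm):
+     # first call: answer the query using only the first chunk of context
+     answer = llm(f"Context: {chunks[0]}\nAnswer this question: {query}")
+     for chunk in chunks[1:]:
+         # each later call sees only the previous answer and the next chunk,
+         # so extra instructions appended to the query can get lost here
+         answer = llm(
+             f"Original question: {query}\n"
+             f"Existing answer: {answer}\n"
+             f"New context: {chunk}\n"
+             "Refine the existing answer if needed."
+         )
+     return answer
+ ```
+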
+ So, the refine process seems to be messing with our results! Rather than appending extra instructions to the `query_str`, remove that, and Llama Index will let us provide our own custom prompts! Let's create those now, using the [default prompts](https://github.com/jerryjliu/llama_index/blob/main/gpt_index/prompts/default_prompts.py) and [chat specific prompts](https://github.com/jerryjliu/llama_index/blob/main/gpt_index/prompts/chat_prompts.py) as a guide. Using a new file `constants.py`, let's create some new query templates:
+
+ ```python
+ from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model
+ from langchain.prompts.chat import (
+     AIMessagePromptTemplate,
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+
+ from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
+
+ # Text QA templates
+ DEFAULT_TEXT_QA_PROMPT_TMPL = (
+     "Context information is below. \n"
+     "---------------------\n"
+     "{context_str}"
+     "\n---------------------\n"
+     "Given the context information answer the following question "
+     "(if you don't know the answer, use the best of your knowledge): {query_str}\n"
+ )
+ TEXT_QA_TEMPLATE = QuestionAnswerPrompt(DEFAULT_TEXT_QA_PROMPT_TMPL)
+
+ # Refine templates
+ DEFAULT_REFINE_PROMPT_TMPL = (
+     "The original question is as follows: {query_str}\n"
+     "We have provided an existing answer: {existing_answer}\n"
+     "We have the opportunity to refine the existing answer "
+     "(only if needed) with some more context below.\n"
+     "------------\n"
+     "{context_msg}\n"
+     "------------\n"
+     "Given the new context and using the best of your knowledge, improve the existing answer. "
+     "If you can't improve the existing answer, just repeat it again."
+ )
+ DEFAULT_REFINE_PROMPT = RefinePrompt(DEFAULT_REFINE_PROMPT_TMPL)
+
+ CHAT_REFINE_PROMPT_TMPL_MSGS = [
+     HumanMessagePromptTemplate.from_template("{query_str}"),
+     AIMessagePromptTemplate.from_template("{existing_answer}"),
+     HumanMessagePromptTemplate.from_template(
+         "We have the opportunity to refine the above answer "
+         "(only if needed) with some more context below.\n"
+         "------------\n"
+         "{context_msg}\n"
+         "------------\n"
+         "Given the new context and using the best of your knowledge, improve the existing answer. "
+         "If you can't improve the existing answer, just repeat it again."
+     ),
+ ]
+
+ CHAT_REFINE_PROMPT_LC = ChatPromptTemplate.from_messages(CHAT_REFINE_PROMPT_TMPL_MSGS)
+ CHAT_REFINE_PROMPT = RefinePrompt.from_langchain_prompt(CHAT_REFINE_PROMPT_LC)
+
+ # refine prompt selector
+ DEFAULT_REFINE_PROMPT_SEL_LC = ConditionalPromptSelector(
+     default_prompt=DEFAULT_REFINE_PROMPT.get_langchain_prompt(),
+     conditionals=[(is_chat_model, CHAT_REFINE_PROMPT.get_langchain_prompt())],
+ )
+ REFINE_TEMPLATE = RefinePrompt(
+     langchain_prompt_selector=DEFAULT_REFINE_PROMPT_SEL_LC
+ )
+ ```
+
+ So that seems like a lot of code, but it's not too bad! If you looked at the default prompts, you might have noticed that there are both generic prompts and prompts specific to chat models. Continuing that trend, we do the same for our custom prompts. Then, using a prompt selector, we can combine both prompts into a single object. If the LLM being used is a chat model (ChatGPT, GPT-4), then the chat prompts are used. Otherwise, the normal prompt templates are used.
+
+ Another thing to note is that we only defined one QA template. In a chat model, this will be converted to a single "human" message.
+
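+ In other words (a plain-data sketch of the idea, not Llama Index code), the same filled-in template string just becomes the content of one user message for chat models:
+
+ ```python
+ qa_prompt = "Context information is below.\n{context_str}\nAnswer the question: {query_str}\n"
+ filled = qa_prompt.format(context_str="Term: bunnyhug Definition: ...", query_str="What is a bunnyhug?")
+
+ # completion model: `filled` is sent directly as the text prompt;
+ # chat model: the same text is wrapped as a single "human"/user message
+ chat_messages = [{"role": "user", "content": filled}]
+ ```
+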
+ So, now we can import these prompts into our app and use them during the query.
+
+ ```python
+ from constants import REFINE_TEMPLATE, TEXT_QA_TEMPLATE
+ ...
+ if "llama_index" in st.session_state:
+     query_text = st.text_input("Ask about a term or definition:")
+     if query_text:
+         query_text = query_text  # Notice we removed the old instructions
+         with st.spinner("Generating answer..."):
+             response = st.session_state["llama_index"].query(
+                 query_text, similarity_top_k=5, response_mode="compact",
+                 text_qa_template=TEXT_QA_TEMPLATE, refine_template=REFINE_TEMPLATE
+             )
+         st.markdown(str(response))
+ ...
+ ```
+
+ If you experiment a bit more with queries, hopefully you notice that the responses follow our instructions a little better now!
+
+ ## Improvement #3 - Image Support
+
+ Llama Index also supports images! Using Llama Index, we can upload images of documents (papers, letters, etc.), and Llama Index handles extracting the text. We can leverage this to also allow users to upload images of their documents and extract terms and definitions from them.
+
+ If you get an import error about PIL, install it using `pip install Pillow` first.
+
+ ```python
+ from PIL import Image
+ from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR, ImageParser
+
+ @st.cache_resource
+ def get_file_extractor():
+     image_parser = ImageParser(keep_image=True, parse_text=True)
+     file_extractor = DEFAULT_FILE_EXTRACTOR
+     file_extractor.update(
+         {
+             ".jpg": image_parser,
+             ".png": image_parser,
+             ".jpeg": image_parser,
+         }
+     )
+
+     return file_extractor
+
+ file_extractor = get_file_extractor()
+ ...
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     if st.button("Initialize Index and Reset Terms", key="init_index_1"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = DEFAULT_TERMS
+
+     if "llama_index" in st.session_state:
+         st.markdown(
+             "Either upload an image/screenshot of a document, or enter the text manually."
+         )
+         uploaded_file = st.file_uploader(
+             "Upload an image/screenshot of a document:", type=["png", "jpg", "jpeg"]
+         )
+         document_text = st.text_area("Or enter raw text")
+         if st.button("Extract Terms and Definitions") and (
+             uploaded_file or document_text
+         ):
+             st.session_state["terms"] = {}
+             terms_docs = {}
+             with st.spinner("Extracting (images may be slow)..."):
+                 if document_text:
+                     terms_docs.update(
+                         extract_terms(
+                             [Document(document_text)],
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+                 if uploaded_file:
+                     Image.open(uploaded_file).convert("RGB").save("temp.png")
+                     img_reader = SimpleDirectoryReader(
+                         input_files=["temp.png"], file_extractor=file_extractor
+                     )
+                     img_docs = img_reader.load_data()
+                     os.remove("temp.png")
+                     terms_docs.update(
+                         extract_terms(
+                             img_docs,
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+             st.session_state["terms"].update(terms_docs)
+
+     if "terms" in st.session_state and st.session_state["terms"]:
+         st.markdown("Extracted terms")
+         st.json(st.session_state["terms"])
+
+         if st.button("Insert terms?"):
+             with st.spinner("Inserting terms"):
+                 insert_terms(st.session_state["terms"])
+             st.session_state["all_terms"].update(st.session_state["terms"])
+             st.session_state["terms"] = {}
+             st.experimental_rerun()
+ ```
+
+ Here, we added the option to upload a file using Streamlit. Then the image is opened and saved to disk (this seems hacky, but it keeps things simple). Then we pass the image path to the reader, extract the documents/text, and remove our temp image file.
+
+ Now that we have the documents, we can call `extract_terms()` the same as before.
+
+ ## Conclusion/TLDR
+
+ In this tutorial, we covered a ton of information, while solving some common issues and problems along the way:
+
+ - Using different indexes for different use cases (List vs. Vector index)
+ - Storing global state values with Streamlit's `session_state` concept
+ - Customizing internal prompts with Llama Index
+ - Reading text from images with Llama Index
+
+ The final version of this tutorial can be found [here](https://github.com/logan-markewich/llama_index_starter_pack).
utils.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ from langchain import OpenAI
+ from langchain.chat_models import ChatOpenAI
+
+
+ def get_llm(llm_name, model_temperature, api_key, max_tokens=256):
+     os.environ["OPENAI_API_KEY"] = api_key
+     if llm_name == "text-davinci-003":
+         return OpenAI(
+             temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens
+         )
+     else:
+         return ChatOpenAI(
+             temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens
+         )