cheesyFishes committed on
Commit b0bfa1a • 1 Parent(s): c755c9e

initial commit

Files changed (8)
  1. README.md +4 -4
  2. app.py +208 -0
  3. constants.py +105 -0
  4. index.json +0 -0
  5. nyc_wiki.txt +0 -0
  6. requirements.txt +4 -0
  7. terms_definitions_tutorial.md +494 -0
  8. utils.py +15 -0
README.md CHANGED
@@ -1,12 +1,12 @@
  ---
  title: Llama Index Term Definition Demo
- emoji: 😻
+ emoji: 🦙📚
- colorFrom: pink
+ colorFrom: blue
  colorTo: purple
  sdk: streamlit
- sdk_version: 1.17.0
+ sdk_version: 1.19.0
  app_file: app.py
- pinned: false
+ pinned: true
  license: mit
  ---

app.py ADDED
@@ -0,0 +1,208 @@
+ import os
+ import streamlit as st
+
+ from PIL import Image
+ from llama_index import (
+     Document,
+     GPTSimpleVectorIndex,
+     GPTListIndex,
+     LLMPredictor,
+     ServiceContext,
+     SimpleDirectoryReader,
+     PromptHelper,
+ )
+ from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR, ImageParser
+
+ from constants import DEFAULT_TERM_STR, DEFAULT_TERMS, REFINE_TEMPLATE, TEXT_QA_TEMPLATE
+ from utils import get_llm
+
+
+ # track every term/definition inserted so far, per user session
+ if "all_terms" not in st.session_state:
+     st.session_state["all_terms"] = DEFAULT_TERMS
+
+
+ @st.cache_resource
+ def get_file_extractor():
+     # route image files to the ImageParser so their text can be extracted
+     image_parser = ImageParser(keep_image=True, parse_text=True)
+     file_extractor = DEFAULT_FILE_EXTRACTOR
+     file_extractor.update(
+         {
+             ".jpg": image_parser,
+             ".png": image_parser,
+             ".jpeg": image_parser,
+         }
+     )
+
+     return file_extractor
+
+
+ file_extractor = get_file_extractor()
+
+
+ def extract_terms(documents, term_extract_str, llm_name, model_temperature, api_key):
+     llm = get_llm(llm_name, model_temperature, api_key, max_tokens=1024)
+
+     service_context = ServiceContext.from_defaults(
+         llm_predictor=LLMPredictor(llm=llm),
+         prompt_helper=PromptHelper(
+             max_input_size=4096, max_chunk_overlap=20, num_output=1024
+         ),
+         chunk_size_limit=1024,
+     )
+
+     temp_index = GPTListIndex.from_documents(documents, service_context=service_context)
+     terms_definitions = str(
+         temp_index.query(term_extract_str, response_mode="tree_summarize")
+     )
+     terms_definitions = [
+         x
+         for x in terms_definitions.split("\n")
+         if x and "Term:" in x and "Definition:" in x
+     ]
+     # parse the text into a dict
+     terms_to_definition = {
+         x.split("Definition:")[0]
+         .split("Term:")[-1]
+         .strip(): x.split("Definition:")[-1]
+         .strip()
+         for x in terms_definitions
+     }
+     return terms_to_definition
+
+
+ def insert_terms(terms_to_definition):
+     for term, definition in terms_to_definition.items():
+         doc = Document(f"Term: {term}\nDefinition: {definition}")
+         st.session_state["llama_index"].insert(doc)
+
+
+ @st.cache_resource
+ def initialize_index(llm_name, model_temperature, api_key):
+     """Load the GPTSimpleVectorIndex from disk."""
+     llm = get_llm(llm_name, model_temperature, api_key)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
+
+     index = GPTSimpleVectorIndex.load_from_disk(
+         "./index.json", service_context=service_context
+     )
+
+     return index
+
+
+ st.title("🦙 Llama Index Term Extractor 🦙")
+ st.markdown(
+     (
+         "This demo allows you to upload your own documents (either a screenshot/image or the actual text) and extract terms and definitions, building a knowledge base!\n\n"
+         "Powered by [Llama Index](https://gpt-index.readthedocs.io/en/latest/index.html) and OpenAI, you can augment the existing knowledge of an "
+         "LLM using your own notes, documents, and images. Then, when you ask about a term or definition, it will use your data first! "
+     )
+ )
+
+ setup_tab, terms_tab, upload_tab, query_tab = st.tabs(
+     ["Setup", "All Terms", "Upload/Extract Terms", "Query Terms"]
+ )
+
+ with setup_tab:
+     st.subheader("LLM Setup")
+     api_key = st.text_input("Enter your OpenAI API key here", type="password")
+     llm_name = st.selectbox(
+         "Which LLM?", ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"]
+     )
+     model_temperature = st.slider(
+         "LLM Temperature", min_value=0.0, max_value=1.0, step=0.1
+     )
+     term_extract_str = st.text_area(
+         "The query to extract terms and definitions with.", value=DEFAULT_TERM_STR
+     )
+
+
+ with terms_tab:
+     st.subheader("Current Extracted Terms and Definitions")
+     st.json(st.session_state["all_terms"])
+
+
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     if st.button("Initialize Index and Reset Terms", key="init_index_1"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = DEFAULT_TERMS
+
+     if "llama_index" in st.session_state:
+         st.markdown(
+             "Either upload an image/screenshot of a document, or enter the text manually."
+         )
+         uploaded_file = st.file_uploader(
+             "Upload an image/screenshot of a document:", type=["png", "jpg", "jpeg"]
+         )
+         document_text = st.text_area("Or enter raw text")
+         if st.button("Extract Terms and Definitions") and (
+             uploaded_file or document_text
+         ):
+             st.session_state["terms"] = {}
+             terms_docs = {}
+             with st.spinner("Extracting (images may be slow)..."):
+                 if document_text:
+                     terms_docs.update(
+                         extract_terms(
+                             [Document(document_text)],
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+                 if uploaded_file:
+                     # save the upload to a temp file so the reader can parse it
+                     Image.open(uploaded_file).convert("RGB").save("temp.png")
+                     img_reader = SimpleDirectoryReader(
+                         input_files=["temp.png"], file_extractor=file_extractor
+                     )
+                     img_docs = img_reader.load_data()
+                     os.remove("temp.png")
+                     terms_docs.update(
+                         extract_terms(
+                             img_docs,
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+             st.session_state["terms"].update(terms_docs)
+
+     if "terms" in st.session_state and st.session_state["terms"]:
+         st.markdown("Extracted terms")
+         st.json(st.session_state["terms"])
+
+         if st.button("Insert terms?"):
+             with st.spinner("Inserting terms"):
+                 insert_terms(st.session_state["terms"])
+             st.session_state["all_terms"].update(st.session_state["terms"])
+             st.session_state["terms"] = {}
+             st.experimental_rerun()
+
+ with query_tab:
+     st.subheader("Query for Terms/Definitions!")
+     st.markdown(
+         (
+             "The LLM will attempt to answer your query, and augment its answers using the terms/definitions you've inserted. "
+             "If a term is not in the index, it will answer using its internal knowledge."
+         )
+     )
+     if st.button("Initialize Index and Reset Terms", key="init_index_2"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = DEFAULT_TERMS
+
+     if "llama_index" in st.session_state:
+         query_text = st.text_input("Ask about a term or definition:")
+         if query_text:
+             with st.spinner("Generating answer..."):
+                 response = st.session_state["llama_index"].query(
+                     query_text, similarity_top_k=5, response_mode="compact",
+                     text_qa_template=TEXT_QA_TEMPLATE, refine_template=REFINE_TEMPLATE,
+                 )
+             st.markdown(str(response))
constants.py ADDED
@@ -0,0 +1,105 @@
+ from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model
+ from langchain.prompts.chat import (
+     AIMessagePromptTemplate,
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+
+ from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
+
+ # Text QA templates
+ DEFAULT_TEXT_QA_PROMPT_TMPL = (
+     "Context information is below. \n"
+     "---------------------\n"
+     "{context_str}"
+     "\n---------------------\n"
+     "Given the context information answer the following question "
+     "(if you don't know the answer, use the best of your knowledge): {query_str}\n"
+ )
+ TEXT_QA_TEMPLATE = QuestionAnswerPrompt(DEFAULT_TEXT_QA_PROMPT_TMPL)
+
+ # Refine templates
+ DEFAULT_REFINE_PROMPT_TMPL = (
+     "The original question is as follows: {query_str}\n"
+     "We have provided an existing answer: {existing_answer}\n"
+     "We have the opportunity to refine the existing answer "
+     "(only if needed) with some more context below.\n"
+     "------------\n"
+     "{context_msg}\n"
+     "------------\n"
+     "Given the new context and using the best of your knowledge, improve the existing answer. "
+     "If you can't improve the existing answer, just repeat it again. "
+     "Do not mention that you've read the above context."
+ )
+ DEFAULT_REFINE_PROMPT = RefinePrompt(DEFAULT_REFINE_PROMPT_TMPL)
+
+ CHAT_REFINE_PROMPT_TMPL_MSGS = [
+     HumanMessagePromptTemplate.from_template("{query_str}"),
+     AIMessagePromptTemplate.from_template("{existing_answer}"),
+     HumanMessagePromptTemplate.from_template(
+         "We have the opportunity to refine the above answer "
+         "(only if needed) with some more context below.\n"
+         "------------\n"
+         "{context_msg}\n"
+         "------------\n"
+         "Given the new context and using the best of your knowledge, improve the existing answer. "
+         "If you can't improve the existing answer, just repeat it again. "
+         "Do not mention that you've read the above context."
+     ),
+ ]
+
+ CHAT_REFINE_PROMPT_LC = ChatPromptTemplate.from_messages(CHAT_REFINE_PROMPT_TMPL_MSGS)
+ CHAT_REFINE_PROMPT = RefinePrompt.from_langchain_prompt(CHAT_REFINE_PROMPT_LC)
+
+ # refine prompt selector
+ DEFAULT_REFINE_PROMPT_SEL_LC = ConditionalPromptSelector(
+     default_prompt=DEFAULT_REFINE_PROMPT.get_langchain_prompt(),
+     conditionals=[(is_chat_model, CHAT_REFINE_PROMPT.get_langchain_prompt())],
+ )
+ REFINE_TEMPLATE = RefinePrompt(
+     langchain_prompt_selector=DEFAULT_REFINE_PROMPT_SEL_LC
+ )
+
+ DEFAULT_TERM_STR = (
+     "Make a list of terms and definitions that are defined in the context, "
+     "with one pair on each line. "
+     "If a term is missing its definition, use your best judgment. "
+     "Write each line as follows:\nTerm: <term> Definition: <definition>"
+ )
+
+ DEFAULT_TERMS = {
+     "New York City": "The most populous city in the United States, located at the southern tip of New York State, and the largest metropolitan area in the U.S. by both population and urban area.",
+     "boroughs": "Five administrative divisions of New York City, each coextensive with a respective county of the state of New York: Brooklyn, Queens, Manhattan, The Bronx, and Staten Island.",
+     "metropolitan statistical area": "A geographical region with a relatively high population density at its core and close economic ties throughout the area.",
+     "combined statistical area": "A combination of adjacent metropolitan and micropolitan statistical areas in the United States and Puerto Rico that can demonstrate economic or social linkage.",
+     "megacities": "A city with a population of over 10 million people.",
+     "United Nations": "An intergovernmental organization that aims to maintain international peace and security, develop friendly relations among nations, achieve international cooperation, and be a center for harmonizing the actions of nations.",
+     "Pulitzer Prizes": "A series of annual awards for achievements in journalism, literature, and musical composition in the United States.",
+     "Times Square": "A major commercial and tourist destination in Manhattan, New York City.",
+     "New Netherland": "A Dutch colony in North America that existed from 1614 until 1664.",
+     "Dutch West India Company": "A Dutch trading company that operated as a monopoly in New Netherland from 1621 until 1639-1640.",
+     "patroon system": "A system instituted by the Dutch to attract settlers to New Netherland, whereby wealthy Dutchmen who brought 50 colonists would be awarded land and local political autonomy.",
+     "Peter Stuyvesant": "The last Director-General of New Netherland, who served from 1647 until 1664.",
+     "Treaty of Breda": "A treaty signed in 1667 between the Dutch and English that resulted in the Dutch keeping Suriname and the English keeping New Amsterdam (which was renamed New York).",
+     "African Burying Ground": "A cemetery discovered in Foley Square in the 1990s that included 10,000 to 20,000 graves of colonial-era Africans, some enslaved and some free.",
+     "Stamp Act Congress": "A meeting held in New York in 1765 in response to the Stamp Act, which imposed taxes on printed materials in the American colonies.",
+     "Battle of Long Island": "The largest battle of the American Revolutionary War, fought on August 27, 1776, in Brooklyn, New York City.",
+     "New York Police Department": "The police force of New York City.",
+     "Irish immigrants": "People who immigrated to the United States from Ireland.",
+     "lynched": "To kill someone, especially by hanging, without a legal trial.",
+     "civil unrest": "A situation in which people in a country are angry and likely to protest or fight.",
+     "megacity": "A very large city, typically one with a population of over ten million people.",
+     "World Trade Center": "A complex of buildings in Lower Manhattan, New York City, that were destroyed in the September 11 attacks.",
+     "COVID-19": "A highly infectious respiratory illness caused by the SARS-CoV-2 virus.",
+     "monkeypox outbreak": "An outbreak of a viral disease similar to smallpox, which occurred in the LGBT community in New York City in 2022.",
+     "Hudson River": "A river in the northeastern United States, flowing from the Adirondack Mountains in New York into the Atlantic Ocean.",
+     "estuary": "A partly enclosed coastal body of brackish water with one or more rivers or streams flowing into it, and with a free connection to the open sea.",
+     "East River": "A tidal strait in New York City.",
+     "Five Boroughs": "Refers to the five counties that make up New York City: Bronx, Brooklyn, Manhattan, Queens, and Staten Island.",
+     "Staten Island": "The most suburban of the five boroughs, located southwest of Manhattan and connected to it by the free Staten Island Ferry.",
+     "Todt Hill": "The highest point on the eastern seaboard south of Maine, located on Staten Island.",
+     "Manhattan": "The geographically smallest and most densely populated borough of New York City, known for its skyscrapers, Central Park, and cultural, administrative, and financial centers.",
+     "Brooklyn": "The most populous borough of New York City, located on the western tip of Long Island and known for its cultural diversity, independent art scene, and distinctive neighborhoods.",
+     "Queens": "The largest borough of New York City, located on Long Island north and east of Brooklyn, and known for its ethnic diversity, commercial and residential prominence, and hosting of the annual U.S. Open tennis tournament.",
+     "The Bronx": "The northernmost borough of New York",
+ }
index.json ADDED
The diff for this file is too large to render. See raw diff
 
nyc_wiki.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ langchain==0.0.128
+ llama-index==0.5.4
+ Pillow==9.4.0
+ streamlit==1.19.0
terms_definitions_tutorial.md ADDED
@@ -0,0 +1,494 @@
+ # Llama Index Problem Solving - Extracting Terms and Definitions
+
+ Llama Index has many use cases (semantic search, summarization, etc.) that are [well documented](https://gpt-index.readthedocs.io/en/latest/use_cases/queries.html). However, this doesn't mean we can't apply Llama Index to very specific use cases!
+
+ In this tutorial, we will go through the design process of using Llama Index to extract terms and definitions from text, while allowing users to query those terms later. Using [Streamlit](https://streamlit.io/), we can provide an easy-to-build frontend for running and testing all of this, and quickly iterate with our design.
+
+ This tutorial assumes you have the following installed:
+
+ - python3.9+
+ - llama_index
+ - streamlit
+
+ At the base level, our objective is to take text from a document, extract terms and definitions, and then provide a way for users to query that knowledge base of terms and definitions. The tutorial will go over features from both Llama Index and Streamlit, and hopefully provide some interesting solutions for common problems that come up.
+
+ The final version of this tutorial can be found [here](https://github.com/logan-markewich/llama_index_starter_pack).
+
+ ## Uploading Text
+
+ Step one is giving users a way to upload documents. Let's write some code using Streamlit to provide the interface for this! Use the following code and launch the app with `streamlit run app.py`.
+
+ ```python
+ import streamlit as st
+
+ st.title("🦙 Llama Index Term Extractor 🦙")
+
+ document_text = st.text_area("Or enter raw text")
+ if st.button("Extract Terms and Definitions") and document_text:
+     with st.spinner("Extracting..."):
+         extracted_terms = document_text  # this is a placeholder!
+     st.write(extracted_terms)
+ ```
+
+ Super simple, right? But you'll notice that the app doesn't do anything useful yet. To use llama_index, we also need to set up our OpenAI LLM. There are a bunch of possible settings for the LLM, so we can let the user figure out what's best. We should also let the user set the prompt that will extract the terms (which will also help us debug what works best).
+
+ ## LLM Settings
+
+ This next step introduces some tabs to our app, to separate it into different panes that provide different features. Let's create a tab for LLM settings and for uploading text:
+
+ ```python
+ import os
+ import streamlit as st
+
+ DEFAULT_TERM_STR = (
+     "Make a list of terms and definitions that are defined in the context, "
+     "with one pair on each line. "
+     "If a term is missing its definition, use your best judgment. "
+     "Write each line as follows:\nTerm: <term> Definition: <definition>"
+ )
+
+ st.title("🦙 Llama Index Term Extractor 🦙")
+
+ setup_tab, upload_tab = st.tabs(["Setup", "Upload/Extract Terms"])
+
+ with setup_tab:
+     st.subheader("LLM Setup")
+     api_key = st.text_input("Enter your OpenAI API key here", type="password")
+     llm_name = st.selectbox('Which LLM?', ["text-davinci-003", "gpt-3.5-turbo", "gpt-4"])
+     model_temperature = st.slider("LLM Temperature", min_value=0.0, max_value=1.0, step=0.1)
+     term_extract_str = st.text_area("The query to extract terms and definitions with.", value=DEFAULT_TERM_STR)
+
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     document_text = st.text_area("Or enter raw text")
+     if st.button("Extract Terms and Definitions") and document_text:
+         with st.spinner("Extracting..."):
+             extracted_terms = document_text  # this is a placeholder!
+         st.write(extracted_terms)
+ ```
+
+ Now our app has two tabs, which really helps with the organization. You'll also notice I added a default prompt to extract terms -- you can change this later once you try extracting some terms; it's just the prompt I arrived at after experimenting a bit.
+
+ Speaking of extracting terms, it's time to add some functions to do just that!
+
+ ## Extracting and Storing Terms
+
+ Now that we are able to define LLM settings and upload text, we can try using Llama Index to extract the terms from text for us!
+
+ We can add the following functions to both initialize our LLM, as well as use it to extract terms from the input text.
+
+ ```python
+ from langchain import OpenAI
+ from langchain.chat_models import ChatOpenAI
+ from llama_index import Document, GPTListIndex, LLMPredictor, ServiceContext, PromptHelper
+
+ def get_llm(llm_name, model_temperature, api_key, max_tokens=256):
+     os.environ['OPENAI_API_KEY'] = api_key
+     if llm_name == "text-davinci-003":
+         return OpenAI(temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens)
+     else:
+         return ChatOpenAI(temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens)
+
+ def extract_terms(documents, term_extract_str, llm_name, model_temperature, api_key):
+     llm = get_llm(llm_name, model_temperature, api_key, max_tokens=1024)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm),
+                                                    prompt_helper=PromptHelper(max_input_size=4096,
+                                                                               max_chunk_overlap=20,
+                                                                               num_output=1024),
+                                                    chunk_size_limit=1024)
+
+     temp_index = GPTListIndex.from_documents(documents, service_context=service_context)
+     terms_definitions = str(temp_index.query(term_extract_str, response_mode="tree_summarize"))
+     terms_definitions = [x for x in terms_definitions.split("\n") if x and 'Term:' in x and 'Definition:' in x]
+     # parse the text into a dict
+     terms_to_definition = {x.split("Definition:")[0].split("Term:")[-1].strip(): x.split("Definition:")[-1].strip() for x in terms_definitions}
+     return terms_to_definition
+ ```
+
+ Now, using the new functions, we can finally extract our terms!
+
+ ```python
+ ...
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     document_text = st.text_area("Or enter raw text")
+     if st.button("Extract Terms and Definitions") and document_text:
+         with st.spinner("Extracting..."):
+             extracted_terms = extract_terms([Document(document_text)],
+                                             term_extract_str, llm_name,
+                                             model_temperature, api_key)
+         st.write(extracted_terms)
+ ```
+
+ There's a lot going on now, so let's take a moment to go over what is happening.
+
+ `get_llm()` is instantiating the LLM based on the user configuration from the setup tab. Based on the model name, we need to use the appropriate class (`OpenAI` vs. `ChatOpenAI`).
+
+ `extract_terms()` is where all the good stuff happens. First, we call `get_llm()` with `max_tokens=1024`, since we don't want to limit the model too much when it is extracting our terms and definitions (the default is 256 if not set). Then, we define our `ServiceContext` object, aligning `num_output` with our `max_tokens` value, as well as setting the chunk size to be no larger than the output. When documents are indexed by Llama Index, they are broken into chunks (also called nodes) if they are large, and `chunk_size_limit` sets the maximum size for these chunks.
+
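+ As rough arithmetic (our reading of why these settings line up, not code from the app): the context window has to hold both the prompt and the generated answer, so reserving `num_output` tokens leaves the rest for context.
+
+ ```python
+ max_input_size = 4096  # the model's total context window
+ num_output = 1024      # tokens reserved for the generated answer
+
+ # roughly what's left for the prompt template + context chunks per call
+ available_for_context = max_input_size - num_output
+ print(available_for_context)  # 3072 -- so 1024-token chunks fit comfortably
+ ```
+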
+ Next, we create a temporary list index and pass in our service context. A list index will read every single piece of text in our index, which is perfect for extracting terms. Then, we use our pre-defined query text to extract terms, using `response_mode="tree_summarize"`. This response mode will generate a tree of summaries from the bottom up, where each parent summarizes its children. The top of the tree is then returned, and it will contain all our extracted terms and definitions.
+
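+ Conceptually, tree summarization works something like the sketch below (a simplified illustration of the idea, not Llama Index's actual implementation; `summarize` stands in for a real LLM call):
+
+ ```python
+ def tree_summarize(chunks, query, summarize):
+     # summarize(query, texts) -> str is one LLM call over the given texts
+     while len(chunks) > 1:
+         # combine neighboring chunks pairwise, building the tree bottom-up
+         chunks = [summarize(query, chunks[i:i + 2]) for i in range(0, len(chunks), 2)]
+     return chunks[0]  # the root summary is the final answer
+ ```
+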
+ Lastly, we do some minor post-processing. We assume the model followed instructions and put a term/definition pair on each line. If a line is missing the `Term:` or `Definition:` labels, we skip it. Then, we convert this to a dictionary for easy storage!
+
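+ As a quick illustration of that parsing (using made-up response text):
+
+ ```python
+ sample_response = (
+     "Term: bunnyhug Definition: Another word for a hoodie.\n"
+     "Some stray line without labels\n"
+     "Term: estuary Definition: A partly enclosed coastal body of brackish water."
+ )
+
+ lines = [x for x in sample_response.split("\n") if x and "Term:" in x and "Definition:" in x]
+ terms_to_definition = {
+     x.split("Definition:")[0].split("Term:")[-1].strip(): x.split("Definition:")[-1].strip()
+     for x in lines
+ }
+ print(terms_to_definition)
+ # {'bunnyhug': 'Another word for a hoodie.',
+ #  'estuary': 'A partly enclosed coastal body of brackish water.'}
+ ```
+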
+ ## Saving Extracted Terms
+
+ Now that we can extract terms, we need to put them somewhere so that we can query them later. A `GPTSimpleVectorIndex` should be a perfect choice for now! In addition, our app should keep track of which terms have been inserted into the index, so that we can inspect them later. Using `st.session_state`, we can store the current list of terms in a session dict, unique to each user!
+
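+ If you haven't used it before, `st.session_state` is a dictionary that survives Streamlit's script reruns (every widget interaction reruns the whole script). A tiny standalone example of the behavior (not part of our app):
+
+ ```python
+ import streamlit as st
+
+ # without session state, this counter would reset to 0 on every rerun
+ if "count" not in st.session_state:
+     st.session_state["count"] = 0
+
+ if st.button("Increment"):
+     st.session_state["count"] += 1
+
+ st.write(f"Button clicked {st.session_state['count']} times")
+ ```
+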
+ First things first though, let's add a feature to initialize a global vector index and another function to insert the extracted terms.
+
+ ```python
+ ...
+ if 'all_terms' not in st.session_state:
+     st.session_state['all_terms'] = {}
+ ...
+
+ def insert_terms(terms_to_definition):
+     for term, definition in terms_to_definition.items():
+         doc = Document(f"Term: {term}\nDefinition: {definition}")
+         st.session_state['llama_index'].insert(doc)
+
+ @st.cache_resource
+ def initialize_index(llm_name, model_temperature, api_key):
+     """Create the GPTSimpleVectorIndex object."""
+     llm = get_llm(llm_name, model_temperature, api_key)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
+
+     index = GPTSimpleVectorIndex([], service_context=service_context)
+
+     return index
+
+ ...
+
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     if st.button("Initialize Index and Reset Terms"):
+         st.session_state['llama_index'] = initialize_index(llm_name, model_temperature, api_key)
+         st.session_state['all_terms'] = {}
+
+     if "llama_index" in st.session_state:
+         st.markdown("Either upload an image/screenshot of a document, or enter the text manually.")
+         document_text = st.text_area("Or enter raw text")
+         if st.button("Extract Terms and Definitions") and document_text:
+             st.session_state['terms'] = {}
+             terms_docs = {}
+             with st.spinner("Extracting..."):
+                 terms_docs.update(extract_terms([Document(document_text)], term_extract_str, llm_name, model_temperature, api_key))
+             st.session_state['terms'].update(terms_docs)
+
+     if "terms" in st.session_state and st.session_state["terms"]:
+         st.markdown("Extracted terms")
+         st.json(st.session_state['terms'])
+
+         if st.button("Insert terms?"):
+             with st.spinner("Inserting terms"):
+                 insert_terms(st.session_state['terms'])
+             st.session_state['all_terms'].update(st.session_state['terms'])
+             st.session_state['terms'] = {}
+             st.experimental_rerun()
+ ```
+
+ Now you are really starting to leverage the power of Streamlit! Let's start with the code under the upload tab. We added a button to initialize the vector index, and we store it in the global Streamlit state dictionary, as well as resetting the currently extracted terms. Then, after extracting terms from the input text, we store the extracted terms in the global state again and give the user a chance to review them before inserting. If the insert button is pressed, then we call our insert terms function, update our global tracking of inserted terms, and remove the most recently extracted terms from the session state.
+
+ ## Querying for Extracted Terms/Definitions
+
+ With the terms and definitions extracted and saved, how can we use them? And how will the user even remember what's previously been saved? We can simply add some more tabs to the app to handle these features.
+
+ ```python
+ ...
+ setup_tab, terms_tab, upload_tab, query_tab = st.tabs(
+     ["Setup", "All Terms", "Upload/Extract Terms", "Query Terms"]
+ )
+ ...
+ with terms_tab:
+     st.subheader("Current Extracted Terms and Definitions")
+     st.json(st.session_state["all_terms"])
+ ...
+ with query_tab:
+     st.subheader("Query for Terms/Definitions!")
+     st.markdown(
+         (
+             "The LLM will attempt to answer your query, and augment its answers using the terms/definitions you've inserted. "
+             "If a term is not in the index, it will answer using its internal knowledge."
+         )
+     )
+     if st.button("Initialize Index and Reset Terms", key="init_index_2"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = {}
+
+     if "llama_index" in st.session_state:
+         query_text = st.text_input("Ask about a term or definition:")
+         if query_text:
+             query_text = query_text + "\nIf you can't find the answer, answer the query with the best of your knowledge."
+             with st.spinner("Generating answer..."):
+                 response = st.session_state["llama_index"].query(
+                     query_text, similarity_top_k=5, response_mode="compact"
+                 )
+             st.markdown(str(response))
+ ```
+
+ While this is mostly basic, some important things to note:
+
+ - Our initialize button has the same text as our other button. Streamlit will complain about this, so we provide a unique key instead.
+ - Some additional text has been added to the query! This is to try and compensate for times when the index does not have the answer.
+ - In our index query, we've specified two options:
+   - `similarity_top_k=5` means the index will fetch the top 5 closest matching terms/definitions to the query.
+   - `response_mode="compact"` means as much text as possible from the 5 matching terms/definitions will be used in each LLM call. Without this, the index would make at least 5 calls to the LLM, which can slow things down for the user (see the rough comparison below).
+
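+ To make that difference concrete, here's some back-of-the-envelope arithmetic (the sizes are made up for illustration, not Llama Index internals):
+
+ ```python
+ import math
+
+ num_chunks = 5          # matching terms/definitions retrieved from the index
+ tokens_per_chunk = 800  # assumed size of each retrieved chunk
+ context_budget = 3000   # assumed room for context in each LLM call
+
+ refine_calls = num_chunks  # default: one LLM call per retrieved chunk
+ compact_calls = math.ceil(num_chunks * tokens_per_chunk / context_budget)
+
+ print(refine_calls, compact_calls)  # 5 vs. 2 -- "compact" packs chunks together
+ ```
+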
+ ## Dry Run Test
+
+ Well, actually I hope you've been testing as we went. But now, let's try one complete test.
+
+ 1. Refresh the app
+ 2. Enter your LLM settings
+ 3. Head over to the query tab
+ 4. Ask the following: `What is a bunnyhug?`
+ 5. The app should give some nonsense response. If you didn't know, a bunnyhug is another word for a hoodie, used by people from the Canadian Prairies!
+ 6. Let's add this definition to the app. Open the upload tab and enter the following text: `A bunnyhug is a common term used to describe a hoodie. This term is used by people from the Canadian Prairies.`
+ 7. Click the extract button. After a few moments, the app should display the correctly extracted term/definition. Click the insert term button to save it!
+ 8. If we open the terms tab, the term and definition we just extracted should be displayed
+ 9. Go back to the query tab and try asking what a bunnyhug is. Now, the answer should be correct!
+
+ ## Improvement #1 - Create a Starting Index
+
+ With our base app working, it might feel like a lot of work to build up a useful index. What if we gave the user some kind of starting point to show off the app's query capabilities? We can do just that! First, let's make a small change to our app so that we save the index to disk after every upload:
+
+ ```python
+ def insert_terms(terms_to_definition):
+     for term, definition in terms_to_definition.items():
+         doc = Document(f"Term: {term}\nDefinition: {definition}")
+         st.session_state['llama_index'].insert(doc)
+     # TEMPORARY - save to disk
+     st.session_state['llama_index'].save_to_disk("index.json")
+ ```
+
+ Now, we need some document to extract from! The repository for this project used the Wikipedia page on New York City, and you can find the text [here](https://github.com/jerryjliu/llama_index/blob/main/examples/test_wiki/data/nyc_text.txt).
+
+ If you paste the text into the upload tab and run it (it may take some time), we can insert the extracted terms. Make sure to also copy the text for the extracted terms into a notepad or similar before inserting them into the index! We will need them in a second.
+
+ After inserting, remove the line of code we used to save the index to disk. With a starting index now saved, we can modify our `initialize_index` function to look like this:
+
+ ```python
+ @st.cache_resource
+ def initialize_index(llm_name, model_temperature, api_key):
+     """Load the GPTSimpleVectorIndex from disk."""
+     llm = get_llm(llm_name, model_temperature, api_key)
+
+     service_context = ServiceContext.from_defaults(llm_predictor=LLMPredictor(llm=llm))
+
+     index = GPTSimpleVectorIndex.load_from_disk(
+         "./index.json", service_context=service_context
+     )
+
+     return index
+ ```
+
+ Did you remember to save that giant list of extracted terms in a notepad? Now when our app initializes, we want to pass in the default terms that are in the index to our global terms state:
+
+ ```python
+ ...
+ if "all_terms" not in st.session_state:
+     st.session_state["all_terms"] = DEFAULT_TERMS
+ ...
+ ```
+
+ Repeat the above anywhere where we were previously resetting the `all_terms` values.
+
+ ## Improvement #2 - (Refining) Better Prompts
+
+ If you play around with the app a bit now, you might notice that it stopped following our prompt! Remember, we added to our `query_str` variable that if the term/definition could not be found, the model should answer to the best of its knowledge. But now if you try asking about random terms (like bunnyhug!), it may or may not follow those instructions.
+
+ This is due to the concept of "refining" answers in Llama Index. Since we are querying across the top 5 matching results, sometimes all the results do not fit in a single prompt! OpenAI models typically have a max input size of 4097 tokens. So, Llama Index accounts for this by breaking up the matching results into chunks that will fit into the prompt. After Llama Index gets an initial answer from the first API call, it sends the next chunk to the API, along with the previous answer, and asks the model to refine that answer.
+
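+ In pseudocode, the refine process looks roughly like this (a simplified sketch of the idea, not the actual Llama Index code; `llm` stands in for a real API call):
+
+ ```python
+ def answer_with_refine(query, chunks, llm):
+     # first call: answer the query using only the first chunk of context
+     answer = llm(f"Context: {chunks[0]}\nAnswer this question: {query}")
+     for chunk in chunks[1:]:
+         # each later call sees only the previous answer and the next chunk,
+         # so extra instructions appended to the query can get lost here
+         answer = llm(
+             f"Original question: {query}\n"
+             f"Existing answer: {answer}\n"
+             f"New context: {chunk}\n"
+             "Refine the existing answer if needed."
+         )
+     return answer
+ ```
+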
+ So, the refine process seems to be messing with our results! Rather than appending extra instructions to the `query_str`, remove that, and Llama Index will let us provide our own custom prompts! Let's create those now, using the [default prompts](https://github.com/jerryjliu/llama_index/blob/main/gpt_index/prompts/default_prompts.py) and [chat specific prompts](https://github.com/jerryjliu/llama_index/blob/main/gpt_index/prompts/chat_prompts.py) as a guide. Using a new file `constants.py`, let's create some new query templates:
+
+ ```python
+ from langchain.chains.prompt_selector import ConditionalPromptSelector, is_chat_model
+ from langchain.prompts.chat import (
+     AIMessagePromptTemplate,
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+ )
+
+ from gpt_index.prompts.prompts import QuestionAnswerPrompt, RefinePrompt
+
+ # Text QA templates
+ DEFAULT_TEXT_QA_PROMPT_TMPL = (
+     "Context information is below. \n"
+     "---------------------\n"
+     "{context_str}"
+     "\n---------------------\n"
+     "Given the context information answer the following question "
+     "(if you don't know the answer, use the best of your knowledge): {query_str}\n"
+ )
+ TEXT_QA_TEMPLATE = QuestionAnswerPrompt(DEFAULT_TEXT_QA_PROMPT_TMPL)
+
+ # Refine templates
+ DEFAULT_REFINE_PROMPT_TMPL = (
+     "The original question is as follows: {query_str}\n"
+     "We have provided an existing answer: {existing_answer}\n"
+     "We have the opportunity to refine the existing answer "
+     "(only if needed) with some more context below.\n"
+     "------------\n"
+     "{context_msg}\n"
+     "------------\n"
+     "Given the new context and using the best of your knowledge, improve the existing answer. "
+     "If you can't improve the existing answer, just repeat it again."
+ )
+ DEFAULT_REFINE_PROMPT = RefinePrompt(DEFAULT_REFINE_PROMPT_TMPL)
+
+ CHAT_REFINE_PROMPT_TMPL_MSGS = [
+     HumanMessagePromptTemplate.from_template("{query_str}"),
+     AIMessagePromptTemplate.from_template("{existing_answer}"),
+     HumanMessagePromptTemplate.from_template(
+         "We have the opportunity to refine the above answer "
+         "(only if needed) with some more context below.\n"
+         "------------\n"
+         "{context_msg}\n"
+         "------------\n"
+         "Given the new context and using the best of your knowledge, improve the existing answer. "
+         "If you can't improve the existing answer, just repeat it again."
+     ),
+ ]
+
+ CHAT_REFINE_PROMPT_LC = ChatPromptTemplate.from_messages(CHAT_REFINE_PROMPT_TMPL_MSGS)
+ CHAT_REFINE_PROMPT = RefinePrompt.from_langchain_prompt(CHAT_REFINE_PROMPT_LC)
+
+ # refine prompt selector
+ DEFAULT_REFINE_PROMPT_SEL_LC = ConditionalPromptSelector(
+     default_prompt=DEFAULT_REFINE_PROMPT.get_langchain_prompt(),
+     conditionals=[(is_chat_model, CHAT_REFINE_PROMPT.get_langchain_prompt())],
+ )
+ REFINE_TEMPLATE = RefinePrompt(
+     langchain_prompt_selector=DEFAULT_REFINE_PROMPT_SEL_LC
+ )
+ ```
+
+ So that seems like a lot of code, but it's not too bad! If you looked at the default prompts, you might have noticed that there are both generic prompts and prompts specific to chat models. Continuing that trend, we do the same for our custom prompts. Then, using a prompt selector, we can combine both prompts into a single object. If the LLM being used is a chat model (ChatGPT, GPT-4), then the chat prompts are used. Otherwise, the normal prompt templates are used.
+
+ Another thing to note is that we only defined one QA template. In a chat model, this will be converted to a single "human" message.
+
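+ In other words (a plain-data sketch of the idea, not Llama Index code), the same filled-in template string just becomes the content of one user message for chat models:
+
+ ```python
+ qa_prompt = "Context information is below.\n{context_str}\nAnswer the question: {query_str}\n"
+ filled = qa_prompt.format(context_str="Term: bunnyhug Definition: ...", query_str="What is a bunnyhug?")
+
+ # completion model: `filled` is sent directly as the text prompt;
+ # chat model: the same text is wrapped as a single "human"/user message
+ chat_messages = [{"role": "user", "content": filled}]
+ ```
+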
+ So, now we can import these prompts into our app and use them during the query.
+
+ ```python
+ from constants import REFINE_TEMPLATE, TEXT_QA_TEMPLATE
+ ...
+ if "llama_index" in st.session_state:
+     query_text = st.text_input("Ask about a term or definition:")
+     if query_text:
+         query_text = query_text  # Notice we removed the old instructions
+         with st.spinner("Generating answer..."):
+             response = st.session_state["llama_index"].query(
+                 query_text, similarity_top_k=5, response_mode="compact",
+                 text_qa_template=TEXT_QA_TEMPLATE, refine_template=REFINE_TEMPLATE
+             )
+         st.markdown(str(response))
+ ...
+ ```
+
+ If you experiment a bit more with queries, hopefully you notice that the responses follow our instructions a little better now!
+
+ ## Improvement #3 - Image Support
+
+ Llama Index also supports images! Using Llama Index, we can upload images of documents (papers, letters, etc.), and Llama Index handles extracting the text. We can leverage this to also allow users to upload images of their documents and extract terms and definitions from them.
+
+ If you get an import error about PIL, install it using `pip install Pillow` first.
+
+ ```python
+ from PIL import Image
+ from llama_index.readers.file.base import DEFAULT_FILE_EXTRACTOR, ImageParser
+
+ @st.cache_resource
+ def get_file_extractor():
+     image_parser = ImageParser(keep_image=True, parse_text=True)
+     file_extractor = DEFAULT_FILE_EXTRACTOR
+     file_extractor.update(
+         {
+             ".jpg": image_parser,
+             ".png": image_parser,
+             ".jpeg": image_parser,
+         }
+     )
+
+     return file_extractor
+
+ file_extractor = get_file_extractor()
+ ...
+ with upload_tab:
+     st.subheader("Extract and Query Definitions")
+     if st.button("Initialize Index and Reset Terms", key="init_index_1"):
+         st.session_state["llama_index"] = initialize_index(
+             llm_name, model_temperature, api_key
+         )
+         st.session_state["all_terms"] = DEFAULT_TERMS
+
+     if "llama_index" in st.session_state:
+         st.markdown(
+             "Either upload an image/screenshot of a document, or enter the text manually."
+         )
+         uploaded_file = st.file_uploader(
+             "Upload an image/screenshot of a document:", type=["png", "jpg", "jpeg"]
+         )
+         document_text = st.text_area("Or enter raw text")
+         if st.button("Extract Terms and Definitions") and (
+             uploaded_file or document_text
+         ):
+             st.session_state["terms"] = {}
+             terms_docs = {}
+             with st.spinner("Extracting (images may be slow)..."):
+                 if document_text:
+                     terms_docs.update(
+                         extract_terms(
+                             [Document(document_text)],
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+                 if uploaded_file:
+                     Image.open(uploaded_file).convert("RGB").save("temp.png")
+                     img_reader = SimpleDirectoryReader(
+                         input_files=["temp.png"], file_extractor=file_extractor
+                     )
+                     img_docs = img_reader.load_data()
+                     os.remove("temp.png")
+                     terms_docs.update(
+                         extract_terms(
+                             img_docs,
+                             term_extract_str,
+                             llm_name,
+                             model_temperature,
+                             api_key,
+                         )
+                     )
+             st.session_state["terms"].update(terms_docs)
+
+     if "terms" in st.session_state and st.session_state["terms"]:
+         st.markdown("Extracted terms")
+         st.json(st.session_state["terms"])
+
+         if st.button("Insert terms?"):
+             with st.spinner("Inserting terms"):
+                 insert_terms(st.session_state["terms"])
+             st.session_state["all_terms"].update(st.session_state["terms"])
+             st.session_state["terms"] = {}
+             st.experimental_rerun()
+ ```
+
+ Here, we added the option to upload a file using Streamlit. Then the image is opened and saved to disk (this seems hacky, but it keeps things simple). Then we pass the image path to the reader, extract the documents/text, and remove our temp image file.
+
+ Now that we have the documents, we can call `extract_terms()` the same as before.
+
+ ## Conclusion/TLDR
+
+ In this tutorial, we covered a ton of information, while solving some common issues and problems along the way:
+
+ - Using different indexes for different use cases (List vs. Vector index)
+ - Storing global state values with Streamlit's `session_state` concept
+ - Customizing internal prompts with Llama Index
+ - Reading text from images with Llama Index
+
+ The final version of this tutorial can be found [here](https://github.com/logan-markewich/llama_index_starter_pack).
utils.py ADDED
@@ -0,0 +1,15 @@
+ import os
+ from langchain import OpenAI
+ from langchain.chat_models import ChatOpenAI
+
+
+ def get_llm(llm_name, model_temperature, api_key, max_tokens=256):
+     os.environ["OPENAI_API_KEY"] = api_key
+     if llm_name == "text-davinci-003":
+         return OpenAI(
+             temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens
+         )
+     else:
+         return ChatOpenAI(
+             temperature=model_temperature, model_name=llm_name, max_tokens=max_tokens
+         )