File size: 7,064 Bytes
9f6ab40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import html
from urllib.parse import quote

import pinecone
import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import BartTokenizer, BartForConditionalGeneration


class BartGenerator:
    """Pair a BART tokenizer with its conditional-generation model.

    The instance exposes ``tokenize`` to turn one query string into model
    inputs and ``generate`` to produce a decoded answer string for a query.
    """

    def __init__(self, model_name):
        """Load the tokenizer and the model identified by ``model_name``."""
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.generator = BartForConditionalGeneration.from_pretrained(model_name)

    def tokenize(self, query, max_length=1024):
        """Tokenize a single query string into PyTorch tensors.

        Args:
            query: The text to encode; it is wrapped in a one-element batch.
            max_length: Upper bound on the number of tokens kept.

        Returns:
            The tokenizer output (includes ``input_ids``).
        """
        # Request truncation explicitly so max_length is enforced by intent
        # rather than through the tokenizer's fallback behaviour.
        return self.tokenizer(
            [query],
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        )

    def generate(self, query, min_length=20, max_length=40, temperature=1.0):
        """Generate and decode an answer for ``query``.

        Args:
            query: Prompt text passed to ``tokenize``.
            min_length: Minimum generated length (coerced to ``int``).
            max_length: Maximum generated length (coerced to ``int``).
            temperature: Temperature forwarded to the model's ``generate``
                call (coerced to ``float``). Previously this value was read
                from an undeclared global and truncated with ``int()``.

        Returns:
            The first decoded sequence as a string.
        """
        inputs = self.tokenize(query)
        # NOTE(review): with num_beams=1 and no do_sample flag, decoding is
        # presumably greedy and temperature has no effect — confirm against
        # the transformers generation documentation.
        ids = self.generator.generate(
            inputs["input_ids"],
            num_beams=1,
            min_length=int(min_length),
            max_length=int(max_length),
            temperature=float(temperature),
        )
        decoded = self.tokenizer.batch_decode(
            ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0]
    
@st.experimental_singleton
def init_models():
    """Create the sentence encoder and the BART answer generator.

    Returns:
        A ``(retriever, generator)`` tuple: a ``SentenceTransformer`` and a
        ``BartGenerator``.
    """
    # Another encoder checkpoint noted by the author: "multi-qa-mpnet-base-cos-v1".
    encoder_name = "flax-sentence-embeddings/all_datasets_v3_mpnet-base"
    generator_name = "vblagoje/bart_lfqa"
    return SentenceTransformer(encoder_name), BartGenerator(generator_name)

# API key read from the Streamlit secrets store.
PINECONE_KEY = st.secrets["PINECONE_KEY"]

@st.experimental_singleton
def init_pinecone():
    """Initialise the Pinecone client and return the "history-qa" index handle."""
    pinecone.init(environment="us-west1-gcp", api_key=PINECONE_KEY)
    history_index = pinecone.Index("history-qa")
    return history_index

# Module-level handles used by the search flow: the query encoder, the answer
# generator and the Pinecone index.
retriever, generator = init_models()
index = init_pinecone()

def display_answer(answer):
    """Render the generated answer as grey text inside a Bootstrap grid cell.

    Args:
        answer: Text to show. It is HTML-escaped before interpolation because
            the markup is rendered with ``unsafe_allow_html=True``, so any
            ``<``, ``>`` or ``&`` in the text must not be treated as markup.

    Returns:
        The value returned by ``st.markdown``.
    """
    safe_answer = html.escape(str(answer))
    return st.markdown(f"""
    <div class="container-fluid">
        <div class="row align-items-start">
            <div  class="col-md-12 col-sm-12">
                <span style="color: #808080;">
                    {safe_answer}
                </span>
            </div>
        </div>
    </div>
        """, unsafe_allow_html=True)

def display_context(title, context, url):
    """Render one retrieved passage as a titled link followed by its text.

    All three values are HTML-escaped before interpolation because the markup
    is rendered with ``unsafe_allow_html=True``. The ``href`` attribute is
    also wrapped in double quotes so a URL containing spaces or ``>`` cannot
    terminate the attribute early.

    Args:
        title: Link text.
        context: Passage text shown in small grey type under the link.
        url: Link target.

    Returns:
        The value returned by ``st.markdown``.
    """
    safe_title = html.escape(str(title))
    safe_context = html.escape(str(context))
    safe_url = html.escape(str(url), quote=True)
    return st.markdown(f"""
    <div class="container-fluid">
        <div class="row align-items-start">
            <div  class="col-md-12 col-sm-12">
                <a href="{safe_url}">{safe_title}</a>
                <br>
                <span style="color: #808080;">
                    <small>{safe_context}</small>
                </span>
            </div>
        </div>
    </div>
        """, unsafe_allow_html=True)

# CSS that sets `visibility: hidden` on the elements matching `#MainMenu` and
# `footer`; injected into the page as raw HTML.
hide_streamlit_style = """
            <style>
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            </style>
            """
st.markdown(hide_streamlit_style, unsafe_allow_html=True) 

# Page heading and introductory description, written as Markdown.
st.write("""
# Jua Historia Yetu 
### An AI Powered Search Engine for East African History and Tourism!

This is an AI powered system designed to help learn about our history, heroes, cultures and tourist destinations.

The system generates a Human-like response to questions asked and points users to where they 
can get more information on what they would like to know.
It is intended to act as a one-stop search engine for all things East Africa including the people, history, culture, wildlife and tourist destinations.
It can be of use to locals, tourists, students or anyone who would like to learn about The East African Community.
The data is to be sourced from the EAC e-resourse database, member nations' meuseums, archives and relevant tourism bodies.

Once queried, the system generates a short answer that the user can quickly read through and also points the user to
some resources they might find usefull. The user can click on the links to learn more.
""")

# Pull in the Bootstrap 4 stylesheet; the display helpers emit markup that
# uses Bootstrap class names (container-fluid, row, col-md-12).
st.markdown("""
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.0.0/dist/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
""", unsafe_allow_html=True)

def format_query(query, context):
    """Build the generator prompt from a question and retrieved matches.

    Args:
        query: The user's question.
        context: Iterable of matches, each providing
            ``match["metadata"]["passage_text"]``.

    Returns:
        A string of the form ``"question: <query> context: <P> p1 <P> p2 ..."``
        where every passage is prefixed with ``<P> `` and passages are joined
        by single spaces.
    """
    passages = " ".join(
        "<P> {}".format(match["metadata"]["passage_text"]) for match in context
    )
    return "question: {} context: {}".format(query, passages)

# set parameters
top_k = 5  # number of matches requested from the index per query
min_length = 1  # passed to generator.generate as min_length
max_length = 150  # passed to generator.generate as max_length
temperature = 3.5  # temperature setting for answer generation

# Sidebar: a list of sample questions users can copy into the search box.
st.sidebar.write("""
## Here are some questions you can try out:
### Copy and paste to test
who was the first person on the moon?\n
Which was the first radio station at Auburn University\n
where is Damastown located\n
What is the Lohanipur Torso \n
when was The Coliseum Theatre opened\n
Who invented the tatoo machine\n
whats th erecipe for Corn chowder\n
when was the Tamil Methodist Church built\n
when was the first electric power system built?\n
How was the first wireless message sent?\n
what was the war of currents?\n
what was NASAs most expensive project?\n
What brands of smokoing paper are manufactured by Miguel y Costas\n
what influenced the naming Holy Forty Martyrs Church\n
When was the world first power system built\n
which is the largest island within the  Halifax Harbour\n
Who was Joseph Monier\n
who were the Karadjordjevic dynasty\n
how many royal tombs were excavated at Tillia Tepe\n 
What did  the HEICO company manufacture\n
tell me about The Battle of Antietam\n
Which was the smallest microbrewery in the United States\n
when did queen marie recieve the bran castle\n
Whe was York Township founded\n
When did the United Nations Security Council  reform the security sector\n
When was Magandang Umaga Po first aired\n
when was Mae Lan District formed\n
what is Voice over Internet Protocol\n
When was InfluxDB developed\n
When was the Semanário Económico newspaper started\n
who owned Kasteln Castle\n
when was The Steinbach Haus built\n
when was the Guerrero ship in Africa\n
tell me about the Guerrero ship\n 
When was the Companhia Paulista de Trens Metropolitanos rilway built\n
When was the lincoln mall demolished\n
where is Damastown located\n
when was solo diving first practiced\n
when was Consumers Credit Union History Consumers Credit Union was founded\n
Who built the castle of Daroynk\n
What is the prime meridian\n
Which was the first radio station at Auburn University\n
What are the origins of feminist music\n
What were the earliest insecticides to be used\n
who were the Drevlians\n
Who were the founders of A.F.C. Euro Kickers\n
when was the camera-on-a-chip developed\n
""")

st.write("If you encounter an error, search again.")
query = st.text_input("Search!", "")

# Skip blank or whitespace-only input so no retrieval or generation runs for it.
if query.strip():
    with st.spinner(text="Wait a sec 🚀🚀🚀"):
        # Embed the question and fetch the closest passages from the index.
        xq = retriever.encode([query]).tolist()
        xc = index.query(xq, top_k=int(top_k), include_metadata=True)
        matches = xc["matches"]
        # Keep the user's question in `query`; the generator prompt gets its
        # own name instead of overwriting it.
        prompt = format_query(query, matches)

    with st.spinner(text="Just a minute ✍️✍️✍️"):
        answer = generator.generate(prompt, min_length=min_length, max_length=max_length)

    st.write("#### System generated response:")
    display_answer(answer)
    st.write("#### Here are some resources you might find relevant:")

    for m in matches:
        title = m["metadata"]["article_title"]
        # Percent-encode the title so characters such as "?", "#" or "%" stay
        # part of the article path instead of breaking the link.
        url = "https://en.wikipedia.org/wiki/" + quote(title.replace(" ", "_"))
        context = m["metadata"]["passage_text"]
        display_context(title, context, url)