import streamlit as st from streamlit.logger import get_logger from timeit import default_timer as timer import sqlite3 import pandas as pd LOGGER = get_logger(__name__) def preprocess(s:str)->str: return s.replace('"','').replace('על','').replace('פרק','').replace('פסוק','').replace('דף','').replace('עמוד','').replace('סימן','').replace('סעיף','').replace('חידושי','').replace("'",'') @st.cache_resource def get_dfs()->object: print('hello from get_dfs..') # //get the books table// # Connect to the database conn = sqlite3.connect('test42.db') # Query the database and retrieve the results cursor = conn.execute("SELECT * FROM titles") results = cursor.fetchall() # Convert the query results into a Pandas DataFrame titles = pd.DataFrame(results) titles.columns=list(map(lambda x: x[0], cursor.description)) # //get the texts table// # Query the database and retrieve the results cursor = conn.execute("SELECT * FROM texts") results = cursor.fetchall() # Convert the query results into a Pandas DataFrame texts = pd.DataFrame(results) texts.columns=list(map(lambda x: x[0], cursor.description)) # //get the references database # Query the database and retrieve the results cursor = conn.execute("SELECT * FROM refs") results = cursor.fetchall() # Convert the query results into a Pandas DataFrame refs = pd.DataFrame(results) refs.columns=list(map(lambda x: x[0], cursor.description)) # Query the database and retrieve the results cursor = conn.execute("SELECT * FROM books") results = cursor.fetchall() # Convert the query results into a Pandas DataFrame books = pd.DataFrame(list(results)) books.columns=list(map(lambda x: x[0], cursor.description)) #merge the books and refs with the texts merged = pd.merge(texts,books,how='inner',left_on='bid',right_on='_id') texts_df = pd.merge(merged,refs,left_on='_id_x',right_on='tid') titles_df = titles return titles_df, texts_df def find_ref(titles_df,texts_df,input_text,top_k,num_of_results,algorithm): from rapidfuzz import fuzz, process as rapidfuzz_process from rapidfuzz.fuzz import token_ratio,ratio,partial_ratio,token_set_ratio,partial_token_set_ratio,token_sort_ratio, WRatio print('hello from find_ref..') if not input_text: return print(eval(algorithm)) results = [] books = titles_df['he_titles'] input_text = input_text.replace(':','עמוד ב').replace('.','עמוד א') scorer = eval(algorithm) # search only the references database in case the user set the top_k to 0 if top_k == 0: refs = texts_df['ref_text_long'].unique() for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs,scorer=scorer, limit=num_of_results,processor=preprocess): results += [{'ref':ref,'ref_score':ref_score}] else: # search first only in the books database (for top_k books) for book, book_score, _ in rapidfuzz_process.extract(input_text, books, scorer=scorer, limit=top_k,processor=preprocess): # get all the references of that book book_title = list(titles_df.loc[titles_df['he_titles']==book]['title'])[0] refs = texts_df.loc[texts_df['title']==book_title]['ref_text_long'].unique() # then search these references and add them all to the results for ref, ref_score, _ in rapidfuzz_process.extract(input_text, refs, limit=10,scorer=scorer,processor=preprocess): results += [{'ref':ref,'ref_score':ref_score,'book':book,'book_score':book_score}] # finaly, sort all the references by their own score (and not the book score) results.sort(key=lambda x: x['ref_score'],reverse=True) return results[:num_of_results] def run(): st.set_page_config( page_title=" חיפוש מקורות", page_icon="📚", layout="wide", initial_sidebar_state="expanded" ) get_dfs() st.write("# חיפוש מקורות באמצעות מרחק לוינשטיין") titles_df,texts_df = get_dfs() user_input = st.text_input('כתוב את המקור המבוקש', placeholder='בבא קמא דף ב עמוד ב') top_k = st.sidebar.slider('כמה ספרים לסרוק top_k:',0,20,10) num_of_results = st.sidebar.slider('מספר התוצאות שברצונך להציג:',1,25,5) algorithm = st.sidebar.selectbox('האלגוריתם לדירוג התוצאות',['token_ratio','ratio','WRatio','partial_ratio','token_set_ratio','partial_token_set_ratio','token_sort_ratio']) if user_input!="": time0 = timer() results = find_ref(titles_df,texts_df,user_input,top_k,num_of_results,algorithm) time = f"finished in {1e3*(timer()-time0):.1f} ms" st.write(time) buttons = [] for i, result in enumerate(results): st.write(result) buttons.append(st.button("פתח " +result['ref'],i)) if buttons[i]: st.write(texts_df.loc[texts_df['ref_text_long']==result['ref']][['heText','ref_text_long']]) if __name__ == "__main__": run()