text-analysis / find_similarity.py
Daryl Fung
initial commit
9b9ea2f
raw
history blame
No virus
2.05 kB
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from functools import partial
from multiprocessing import Pool
# Load pre-trained model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Load data
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a data.pickle that comes from a trusted source.
with open('data.pickle', 'rb') as file:
    data = pickle.load(file)
# Define a function to compute similarity for a pair of sentences
def compute_similarity(model, source_sentence, target_sentence):
    """Return the cosine similarity between two texts as a Python scalar.

    Each input is embedded on its own with ``model.encode`` and the two
    embeddings are compared with ``util.pytorch_cos_sim``.

    Args:
        model: Encoder exposing ``encode(text, convert_to_tensor=True)``.
        source_sentence: Text passed to the first ``encode`` call.
        target_sentence: Text passed to the second ``encode`` call.

    Returns:
        The single similarity score, extracted from the result via ``.item()``.
    """
    # Encode in the same order as before: source first, then target.
    source_vec, target_vec = (
        model.encode(text, convert_to_tensor=True)
        for text in (source_sentence, target_sentence)
    )
    return util.pytorch_cos_sim(source_vec, target_vec).item()
# Define a function to compute similarities for a given source sentence
def compute_similarities_for_source(model, source_sentence, data):
    """Score one record's description against itself and every later record.

    Args:
        model: Encoder exposing ``encode(text, convert_to_tensor=True)``.
        source_sentence: A record from ``data``; its ``'description'`` value
            is the text being compared.
        data: Indexable collection of records (presumably a list of dicts --
            confirm against the pickle contents), each with a
            ``'description'`` key. It must contain ``source_sentence``.

    Returns:
        A list of similarity scores, one per record from the position of
        ``source_sentence`` in ``data`` through the end of ``data``.

    Raises:
        ValueError: Raised by ``data.index`` (for a list) when
            ``source_sentence`` is not present in ``data``.
    """
    # index() matches by equality, so duplicate records resolve to the
    # position of the first occurrence.
    source_index = data.index(source_sentence)
    source_text = source_sentence['description']

    # The source embedding is the same for every pair: encode it once
    # instead of once per target.
    source_embedding = model.encode(source_text, convert_to_tensor=True)

    similarities = []
    for index in tqdm(range(source_index, len(data)),
                      desc=f"Computing similarities for '{source_text}'"):
        target_embedding = model.encode(data[index]['description'],
                                        convert_to_tensor=True)
        score = util.pytorch_cos_sim(source_embedding, target_embedding)
        similarities.append(score.item())
    return similarities
# Define a function to compute similarities for all sentences in the data
def compute_similarities(model, data):
    """Compute per-record similarity lists for every record in ``data``.

    Each record is dispatched to ``compute_similarities_for_source`` through
    a ``multiprocessing.Pool`` with the default worker count.

    Args:
        model: Encoder forwarded to ``compute_similarities_for_source``.
        data: Collection of records; iterated to produce the work items and
            also forwarded in full to every call.

    Returns:
        A list with one entry per record of ``data``, in input order, each
        entry being the list returned by ``compute_similarities_for_source``.
    """
    # imap supplies only the source record, so both ``model`` and ``data``
    # must be bound up front; without ``data`` every call fails with a
    # missing-argument TypeError.
    func = partial(compute_similarities_for_source, model, data=data)
    with Pool() as pool:
        # Drain the iterator inside the ``with``: leaving it shuts the pool down.
        similarities = list(tqdm(pool.imap(func, data), total=len(data), desc="Computing similarities"))
    return similarities
# Embed sentences and compute similarities
# Collect the descriptions once; they are both the encoder input and the
# CSV column headers.
descriptions = [source_sentence['description'] for source_sentence in data]
embeddings = model.encode(descriptions, convert_to_tensor=True)
# Move the result to the CPU before converting: Tensor.numpy() raises for
# tensors that live on a GPU, which is where the embeddings end up when the
# model runs on one.
matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

# Save similarities to CSV file
pd.DataFrame(matrix, columns=descriptions).to_csv('data.csv', index=False)