"""Compute cosine similarities between the ``description`` fields of a dataset.

Run as a script, this module loads ``data.pickle``, embeds the ``description``
of every record with a pre-trained SentenceTransformer model, computes the
full pairwise cosine-similarity matrix and writes it to ``data.csv``.

The helper functions offer a slower, pair-by-pair way of computing the same
similarities; they are not used by the script entry point.
"""
import pickle
from functools import partial
from multiprocessing import Pool

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
DATA_PATH = 'data.pickle'
OUTPUT_PATH = 'data.csv'


def compute_similarity(model, source_sentence, target_sentence):
    """Return the cosine similarity of two sentences as a Python float.

    Args:
        model: Object whose ``encode(text, convert_to_tensor=True)`` returns
            an embedding tensor (a ``SentenceTransformer`` in this script).
        source_sentence: First text to embed.
        target_sentence: Second text to embed.

    Returns:
        The single cosine-similarity value between the two embeddings.
    """
    embedding_1 = model.encode(source_sentence, convert_to_tensor=True)
    embedding_2 = model.encode(target_sentence, convert_to_tensor=True)
    return util.pytorch_cos_sim(embedding_1, embedding_2).item()


def compute_similarities_for_source(model, source_sentence, data):
    """Compare one record with itself and every record that follows it.

    Args:
        model: Encoder passed through to ``model.encode``.
        source_sentence: A record from ``data``; its ``'description'`` value
            is the text that gets compared.
        data: Sequence of records, each providing a ``'description'`` key.

    Returns:
        A list of floats, one per record from the position of
        ``source_sentence`` in ``data`` through the end of ``data``.

    Raises:
        ValueError: If ``source_sentence`` is not found in ``data``.
    """
    # ``index`` returns the first equal record, so duplicates of an earlier
    # record start from that earlier position.
    source_index = data.index(source_sentence)
    source_text = source_sentence['description']

    # The source embedding is the same for every comparison: encode it once
    # instead of once per target.
    source_embedding = model.encode(source_text, convert_to_tensor=True)

    similarities = []
    for target in tqdm(data[source_index:],
                       desc=f"Computing similarities for '{source_text}'"):
        target_embedding = model.encode(target['description'],
                                        convert_to_tensor=True)
        similarities.append(
            util.pytorch_cos_sim(source_embedding, target_embedding).item()
        )
    return similarities


def compute_similarities(model, data):
    """Compute the per-record similarity lists for all records in parallel.

    Each record of ``data`` is handed to ``compute_similarities_for_source``
    in a ``multiprocessing.Pool`` worker.

    Args:
        model: Encoder forwarded to every worker (it is pickled along with
            the task, so it must be picklable).
        data: Sequence of records, each providing a ``'description'`` key.

    Returns:
        A list with one entry per record, in the order of ``data``; each
        entry is the list returned by ``compute_similarities_for_source``.
    """
    # ``data`` must be bound as well: ``imap`` supplies only the record, and
    # the worker function needs all three arguments.
    func = partial(compute_similarities_for_source, model, data=data)
    with Pool() as pool:
        # Consume the iterator while the pool is still alive.
        similarities = list(tqdm(pool.imap(func, data), total=len(data),
                                 desc="Computing similarities"))
    return similarities


def main():
    """Embed all descriptions and save their similarity matrix as CSV."""
    # Load pre-trained model
    model = SentenceTransformer(MODEL_NAME)

    # Load data
    # SECURITY: unpickling can execute arbitrary code; only load a
    # ``data.pickle`` that comes from a trusted source.
    with open(DATA_PATH, 'rb') as file:
        data = pickle.load(file)

    descriptions = [record['description'] for record in data]

    # Embed sentences and compute similarities
    embeddings = model.encode(descriptions, convert_to_tensor=True)
    # Move the result to host memory first: ``.numpy()`` fails on tensors
    # that live on an accelerator device.
    matrix = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

    # Save similarities to CSV file
    pd.DataFrame(matrix, columns=descriptions).to_csv(OUTPUT_PATH, index=False)


# Guarding the entry point keeps multiprocessing workers (which re-import
# this module under the ``spawn`` start method) from re-running the pipeline.
if __name__ == "__main__":
    main()