File size: 1,223 Bytes
9b9ea2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import csv
import pandas as pd

# read an Excel file into a Pandas dataframe
xls = pd.read_excel('description.xlsx', sheet_name='likely_similar', engine='openpyxl')
xls.iloc[:, 1:]  # select all rows and all columns starting from the second column

# get the names of the source databases from the dataframe
source_databases = xls.iloc[:, 1:].columns

# create a dictionary to hold the similarity data
similarity_dict = {}

# loop through each source database and get the list of similar databases
for source_database in source_databases:
    series = xls.loc[:, source_database]
    similar_databases = series[series != False].values.tolist()
    similarity_dict[source_database] = similar_databases

# find the length of the longest list
max_len = max(len(v) for v in similarity_dict.values())

# pad the shorter lists with NaNs to make them the same length as the longest list
for k, v in similarity_dict.items():
    if len(v) < max_len:
        similarity_dict[k] = v + [float('nan')] * (max_len - len(v))

# convert the dictionary to a Pandas dataframe and transpose it
df = pd.DataFrame.from_dict(similarity_dict)
df = df.transpose()

# write the dataframe to a CSV file
df.to_csv('similarity_dict.csv', index=False)