Unit 5: Jaccard Coefficient Calculations
Import from scikit-learn
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
Test results for Jack, Jim, and Mary
# Categorical test results, one single-character code per test (six per person).
jack = list("YNPNNA")
jim = list("YNPAPN")
mary = list("YPNNNA")
One-hot encode: convert categorical values to vector format
# Stack the three result lists into one 2-D array: one row per person,
# one column per test.
data = np.array([jack, jim, mary])

# One-hot encode every test column; fitting and transforming happen in a
# single call on the same data.
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data)

# Pull out each person's encoded row, in the same order the rows were stacked.
jack_vector, jim_vector, mary_vector = (encoded_data[row] for row in range(3))
Get the Jaccard score using scikit-learn
# Each *_vector is a single row of one-hot indicators, so scikit-learn treats a
# pair of them as a one-sample multilabel problem.  average='samples' computes
# |A & B| / |A | B| over the categories active in that row, which is the Jaccard
# coefficient between the two people.
# The previous average='macro' scored every one-hot column as its own label and
# averaged the per-column scores; columns active for neither person counted as
# 0, so the result was (number of matching tests) / (number of one-hot columns)
# rather than intersection over union.
jack_mary_jaccard_coefficient = jaccard_score(jack_vector, mary_vector, average='samples')
jack_jim_jaccard_coefficient = jaccard_score(jack_vector, jim_vector, average='samples')
jim_mary_jaccard_coefficient = jaccard_score(jim_vector, mary_vector, average='samples')
jack_mary_jaccard_coefficient, jack_jim_jaccard_coefficient, jim_mary_jaccard_coefficient
(0.5, 0.3333333333333333, 0.09090909090909091)