Source code for tethne.analyze.features
"""
Methods for analyzing featuresets.
.. autosummary::
:nosignatures:
cosine_distance
cosine_similarity
distance
kl_divergence
"""
from math import sqrt, log, acos, pi
from tethne.utilities import nonzero
def kl_divergence(V_a, V_b):
    """
    Calculate the Kullback-Leibler divergence between two count vectors.

    Uses the smoothing method described in `Bigi 2003
    <http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_
    to facilitate better comparisons between vectors describing wordcounts.

    Parameters
    ----------
    V_a : list
    V_b : list

    Returns
    -------
    divergence : float
        KL divergence.
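
    Examples
    --------
    A minimal, illustrative call on two hypothetical word-count vectors (the
    values below are not from the original module):

    >>> V_a = [5, 3, 0, 1]
    >>> V_b = [4, 2, 1, 0]
    >>> divergence = kl_divergence(V_a, V_b)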
"""
# Find shared features.
Ndiff = _shared_features(V_a, V_b)
# aprob and bprob should each sum to 1.0
aprob = map(lambda v: float(v)/sum(V_a), V_a)
bprob = map(lambda v: float(v)/sum(V_b), V_b)
# Smooth according to Bigi 2003.
aprob, bprob = _smooth(aprob, bprob, Ndiff)
return sum(map(lambda a, b: (a-b)*log(a/b), aprob, bprob))
def cosine_similarity(F_a, F_b):
    """
    Calculate the `cosine similarity
    <http://en.wikipedia.org/wiki/Cosine_similarity>`_ of two sparse feature
    vectors.

    Parameters
    ----------
    F_a : :class:`.Feature`
    F_b : :class:`.Feature`

    Returns
    -------
    similarity : float
        Cosine similarity.
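
    Examples
    --------
    An illustrative sketch, assuming :class:`.Feature` can be imported from
    the top-level ``tethne`` package and instantiated from a list of
    ``(token, value)`` tuples; check ``tethne.classes.feature`` for the
    exact constructor:

    >>> from tethne import Feature                      # doctest: +SKIP
    >>> F_a = Feature([('rabbit', 4), ('dog', 2)])      # doctest: +SKIP
    >>> F_b = Feature([('rabbit', 2), ('cat', 1)])      # doctest: +SKIP
    >>> similarity = cosine_similarity(F_a, F_b)        # doctest: +SKIP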
"""
shared = list(F_a.unique & F_b.unique)
A = [dict(F_a.norm)[i] for i in shared]
B = [dict(F_b.norm)[i] for i in shared]
dot = sum(map(lambda a, b: a*b, A, B))
mag_A = sqrt(sum(map(lambda a: a**2, A)))
mag_B = sqrt(sum(map(lambda a: a**2, B)))
return dot / (mag_A + mag_B)
def angular_similarity(F_a, F_b):
    """
    Calculate the `angular similarity
    <http://en.wikipedia.org/wiki/Cosine_similarity#Angular_similarity>`_ of
    two sparse feature vectors.

    Unlike `cosine_similarity`, the corresponding angular distance is a true
    distance metric.

    Parameters
    ----------
    F_a : :class:`.Feature`
    F_b : :class:`.Feature`

    Returns
    -------
    similarity : float
        Angular similarity.
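
    Examples
    --------
    An illustrative sketch, reusing the hypothetical :class:`.Feature`
    instances from the :func:`.cosine_similarity` example above:

    >>> similarity = angular_similarity(F_a, F_b)       # doctest: +SKIP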
"""
return 1. - (2. * acos(cosine_similarity(F_a, F_b))) / pi
### Helpers ###
def _shared_features(adense, bdense):
    """
    Number of features in ``adense`` that do not appear in ``bdense``.
    """
    a_indices = set(nonzero(adense))
    b_indices = set(nonzero(bdense))

    # Features with nonzero values in a but not in b; these are the
    # positions that will receive epsilon smoothing in b.
    Ndiff = len(a_indices - b_indices)
    return Ndiff
def _smoothing_parameters(aprob, bprob, Ndiff):
    # Epsilon is three orders of magnitude below the smallest normalized
    # probability in either distribution (Bigi 2003).
    min_a = min(aprob[i] for i in nonzero(aprob))
    sum_a = sum(aprob)
    min_b = min(bprob[i] for i in nonzero(bprob))
    sum_b = sum(bprob)
    epsilon = min(min_a / sum_a, min_b / sum_b) * 0.001

    # Gamma discounts the nonzero back-off probabilities to compensate for
    # the epsilon mass assigned to the Ndiff zero-valued features.
    gamma = 1 - Ndiff * epsilon
    return gamma, epsilon
def _smooth(aprob, bprob, Ndiff):
    """
    Smooth distributions for KL divergence according to `Bigi 2003
    <http://link.springer.com/chapter/10.1007%2F3-540-36618-0_22?LI=true>`_.
    """
    gamma, epsilon = _smoothing_parameters(aprob, bprob, Ndiff)

    # Keep only the positions at which aprob is nonzero; the divergence is
    # undefined where the reference distribution has no mass.
    in_a = [i for i, v in enumerate(aprob) if abs(v) > 0.]
    aprob = [aprob[i] for i in in_a]

    # Discount the back-off distribution by gamma, then replace zero values
    # with epsilon.
    bprob = [bprob[i] * gamma for i in in_a]
    bprob = [v if v != 0. else epsilon for v in bprob]
    return aprob, bprob
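

# A minimal, self-contained sketch (not part of the original module) showing
# kl_divergence on two hypothetical word-count vectors. Positions where the
# first vector is zero are dropped, and zero positions in the second vector
# receive a small epsilon via the Bigi-style smoothing above.
if __name__ == '__main__':
    counts_a = [5, 3, 0, 1]
    counts_b = [4, 2, 1, 0]
    print('KL(a || b) =', kl_divergence(counts_a, counts_b))
    print('KL(b || a) =', kl_divergence(counts_b, counts_a))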