Source code for tethne.analyze.features
"""
Methods for analyzing featuresets.
.. autosummary::
:nosignatures:
cosine_distance
cosine_similarity
distance
kl_divergence
"""
import numpy
from scipy.sparse import coo_matrix
import scipy.spatial as spat
[docs]def distance(sa, sb, method, normalize=True, smooth=False):
"""
Calculate the distance between two sparse feature vectors using a method
from scipy.spatial.distance.
Supported distance methods:
================ ====================
Method Documentation
================ ====================
braycurtis `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.braycurtis.html>`_
canberra `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.canberra.html>`_
chebyshev `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.chebyshev.html>`_
cityblock `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cityblock.html>`_
correlation `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.correlation.html>`_
cosine `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html>`_
dice `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.dice.html>`_
euclidean `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.euclidean.html>`_
hamming `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.hamming.html>`_
jaccard `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.jaccard.html>`_
kulsinski `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.kulsinski.html>`_
matching `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.matching.html>`_
rogerstanimoto `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.rogerstanimoto.html>`_
russellrao `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.russellrao.html>`_
sokalmichener `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.sokalmichener.html>`_
sokalsneath `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.sokalsneath.html>`_
sqeuclidean `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.sqeuclidean.html>`_
yule `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html>`_
================ ====================
Parameters
----------
sa : list
A :ref:`sparse-feature-vector`\.
sb : list
A :ref:`sparse-feature-vector`\.
method : str
Name of a method in scipy.spatial.distance (see above).
normalize : bool
(default: True) If True, ``sa`` and ``sb`` are normalized so that they
each sum to 1.0.
smooth : bool
(default: False) If True, uses the smoothing method described in `Bigi
2003
<http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_
Returns
-------
distance : float
Distance value from ``method``.
"""
if method not in spat.distance.__dict__:
raise RuntimeError('No method named {0} in scipy.spatial.distance'
.format(method))
# Convert sparse data into dense arrays.
amax = max(zip(*sa)[0])
bmax = max(zip(*sb)[0])
adense = _sparse_to_array(sa, size=max(amax, bmax)+1)
bdense = _sparse_to_array(sb, size=max(amax, bmax)+1)
if normalize:
adense = numpy.array(adense/float(numpy.sum(adense)))
bdense = numpy.array(bdense/float(numpy.sum(bdense)))
if smooth:
# Smooth according to Bigi 2003.
Ndiff = _shared_features(adense, bdense)
adense, bdense = _smooth(adense, bdense, Ndiff)
return spat.distance.__dict__[method](adense, bdense)
[docs]def kl_divergence(sa, sb):
"""
Calculate Kullback-Leibler Distance for sparse feature vectors.
Uses the smoothing method described in `Bigi 2003
<http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_
to facilitate better comparisons between vectors describing wordcounts.
Parameters
----------
sa : list
A :ref:`sparse-feature-vector`\.
sb : list
A :ref:`sparse-feature-vector`\.
Returns
-------
divergence : float
KL divergence.
"""
# Convert sparse data into dense arrays.
adense = _sparse_to_array(sa)
bdense = _sparse_to_array(sb, size=adense.size)
# Find shared features.
Ndiff = _shared_features(adense, bdense)
# aprob and bprob should each sum to 1.0
aprob = numpy.array(adense/float(numpy.sum(adense)))
bprob = numpy.array(bdense/float(numpy.sum(bdense)))
# Smooth according to Bigi 2003.
aprob, bprob = _smooth(aprob, bprob, Ndiff)
divergence = numpy.sum( (aprob - bprob) * numpy.log(aprob/bprob) )
return divergence
[docs]def cosine_distance(sa, sb):
"""
Calculate `cosine distance
<http://en.wikipedia.org/wiki/Cosine_similarity>`_ for sparse feature
vectors.
Uses the `cosine method in scipy.spatial.distance
<http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html#scipy.spatial.distance.cosine>`_.
Parameters
----------
sa : list
A :ref:`sparse-feature-vector`\.
sb : list
A :ref:`sparse-feature-vector`\.
Returns
-------
distance : float
Cosine distance.
"""
# Convert sparse data into dense arrays.
amax = max(zip(*sa)[0])
bmax = max(zip(*sb)[0])
adense = _sparse_to_array(sa, size=max(amax, bmax)+1)
bdense = _sparse_to_array(sb, size=max(amax, bmax)+1)
from scipy.spatial.distance import cosine
distance = cosine(adense, bdense)
return distance
[docs]def cosine_similarity(sa, sb):
"""
Calculate `cosine similarity
<http://en.wikipedia.org/wiki/Cosine_similarity>`_ for sparse feature
vectors.
Uses the `cosine method in scipy.spatial.distance
<http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html#scipy.spatial.distance.cosine>`_.
Parameters
----------
sa : list
A :ref:`sparse-feature-vector`\.
sb : list
A :ref:`sparse-feature-vector`\.
Returns
-------
similarity : float
Cosine similarity
"""
similarity = -1.*(cosine_distance(sa, sb) - 1.)
return similarity
### Helpers ###
def _sparse_to_coo(svect, size=None):
"""
Convert a sparse feature [(i,v),] to a SciPy COO sparse matrix.
"""
svect = [ (t,v) for t,v in svect if v != 0 ]
JK = zip(*svect)
J = list(JK[0])
K = list(JK[1])
I = [0]*len(J)
# Adding a null value with an index of size-1 makes it easier to work with
# arrays of different size.
if size is not None:
I.append(0)
J.append(size-1)
K.append(0)
coo = coo_matrix((K, (I,J)), shape=(1, max(J)+1))
return coo
def _sparse_to_array(coo, size=None):
"""
Convert a sparse feature [(i,v),] to a vector array.
"""
dense = _sparse_to_coo(coo, size=size).todense()
l = dense.shape[1]
return dense.reshape((l,))
def _shared_features(adense, bdense):
"""
Number of features in ``adense`` that are also in ``bdense``.
"""
a_indices = set(numpy.nonzero(adense)[0])
b_indices = set(numpy.nonzero(bdense)[0])
shared = list(a_indices & b_indices)
diff = list(a_indices - b_indices)
Ndiff = len(diff)
return Ndiff
def _smoothing_parameters(aprob, bprob, Ndiff):
min_a = numpy.min(aprob[numpy.nonzero(aprob)])
sum_a = numpy.sum(aprob)
min_b = numpy.min(bprob[numpy.nonzero(bprob)])
sum_b = numpy.sum(bprob)
epsilon = min((min_a/sum_a), (min_b/sum_b)) * 0.001
gamma = 1 - Ndiff * epsilon
return gamma, epsilon
def _smooth(aprob, bprob, Ndiff):
"""
Smooth distributions for KL-divergence according to `Bigi 2003
<http://link.springer.com/chapter/10.1007%2F3-540-36618-0_22?LI=true>`_.
"""
gamma, epsilon = _smoothing_parameters(aprob, bprob, Ndiff)
# Remove zeros.
in_a = numpy.nonzero(aprob) # Use these indices only.
aprob = aprob[in_a]
bprob = bprob[in_a]*gamma
# Replace zero values with epsilon.
numpy.place(bprob, bprob == 0., (epsilon,))
return aprob, bprob