Source code for tethne.analyze.features

"""
Methods for analyzing featuresets.

.. autosummary::
   :nosignatures:
   
   cosine_distance
   cosine_similarity
   distance
   kl_divergence
   
"""

import numpy
from scipy.sparse import coo_matrix
import scipy.spatial as spat

[docs]def distance(sa, sb, method, normalize=True, smooth=False):
    """
    Calculate the distance between two sparse feature vectors using a method
    from scipy.spatial.distance.
    
    Supported distance methods:

    ================    ====================
    Method              Documentation
    ================    ====================
    braycurtis          `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.braycurtis.html>`_
    canberra            `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.canberra.html>`_
    chebyshev           `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.chebyshev.html>`_
    cityblock           `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cityblock.html>`_
    correlation         `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.correlation.html>`_
    cosine              `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html>`_
    dice                `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.dice.html>`_
    euclidean           `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.euclidean.html>`_
    hamming             `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.hamming.html>`_
    jaccard             `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.jaccard.html>`_
    kulsinski           `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.kulsinski.html>`_
    matching            `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.matching.html>`_
    rogerstanimoto      `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.rogerstanimoto.html>`_
    russellrao          `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.russellrao.html>`_
    sokalmichener       `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.sokalmichener.html>`_
    sokalsneath         `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.sokalsneath.html>`_
    sqeuclidean         `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.sqeuclidean.html>`_
    yule                `scipy.org <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.yule.html>`_
    ================    ====================

    Parameters
    ----------
    sa : list
        A  :ref:`sparse-feature-vector`\.
    sb : list
        A  :ref:`sparse-feature-vector`\.
    method : str
        Name of a method in scipy.spatial.distance (see above).
    normalize : bool
        (default: True) If True, ``sa`` and ``sb`` are normalized so that they
        each sum to 1.0.
    smooth : bool
        (default: False) If True, uses the smoothing method described in `Bigi
        2003 
        <http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_
        
    Returns
    -------
    distance : float
        Distance value from ``method``.

    """
    if method not in spat.distance.__dict__:
        raise RuntimeError('No method named {0} in scipy.spatial.distance'
                                                                .format(method))
    
    # Convert sparse data into dense arrays.
    amax = max(zip(*sa)[0])
    bmax = max(zip(*sb)[0])
    adense = _sparse_to_array(sa, size=max(amax, bmax)+1)
    bdense = _sparse_to_array(sb, size=max(amax, bmax)+1)
    
    if normalize:
        adense = numpy.array(adense/float(numpy.sum(adense)))
        bdense = numpy.array(bdense/float(numpy.sum(bdense)))

    if smooth:
        # Smooth according to Bigi 2003.
        Ndiff = _shared_features(adense, bdense)
        adense, bdense = _smooth(adense, bdense, Ndiff)

    return spat.distance.__dict__[method](adense, bdense)

[docs]def kl_divergence(sa, sb):
    """
    Calculate Kullback-Leibler Distance for sparse feature vectors.
    
    Uses the smoothing method described in `Bigi 2003 
    <http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_
    to facilitate better comparisons between vectors describing wordcounts.
    
    Parameters
    ----------
    sa : list
        A  :ref:`sparse-feature-vector`\.
    sb : list
        A  :ref:`sparse-feature-vector`\.
        
    Returns
    -------
    divergence : float
        KL divergence.
    """
    # Convert sparse data into dense arrays.
    adense = _sparse_to_array(sa)
    bdense = _sparse_to_array(sb, size=adense.size)
 
    # Find shared features.
    Ndiff = _shared_features(adense, bdense)
    
    # aprob and bprob should each sum to 1.0
    aprob = numpy.array(adense/float(numpy.sum(adense)))
    bprob = numpy.array(bdense/float(numpy.sum(bdense)))

    # Smooth according to Bigi 2003.
    aprob, bprob = _smooth(aprob, bprob, Ndiff)

    divergence = numpy.sum( (aprob - bprob) * numpy.log(aprob/bprob) )

    return divergence

[docs]def cosine_distance(sa, sb):
    """
    Calculate `cosine distance
    <http://en.wikipedia.org/wiki/Cosine_similarity>`_ for sparse feature 
    vectors.
    
    Uses the `cosine method in scipy.spatial.distance
    <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html#scipy.spatial.distance.cosine>`_.
    
    Parameters
    ----------
    sa : list
        A  :ref:`sparse-feature-vector`\.
    sb : list
        A  :ref:`sparse-feature-vector`\.
        
    Returns
    -------
    distance : float
        Cosine distance.
    """
    # Convert sparse data into dense arrays.
    amax = max(zip(*sa)[0])
    bmax = max(zip(*sb)[0])
    adense = _sparse_to_array(sa, size=max(amax, bmax)+1)
    bdense = _sparse_to_array(sb, size=max(amax, bmax)+1)

    from scipy.spatial.distance import cosine
    distance = cosine(adense, bdense)
    
    return distance

[docs]def cosine_similarity(sa, sb):
    """
    Calculate `cosine similarity 
    <http://en.wikipedia.org/wiki/Cosine_similarity>`_ for sparse feature 
    vectors.
    
    Uses the `cosine method in scipy.spatial.distance
    <http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html#scipy.spatial.distance.cosine>`_.
    
    Parameters
    ----------
    sa : list
        A  :ref:`sparse-feature-vector`\.
    sb : list
        A  :ref:`sparse-feature-vector`\.
        
    Returns
    -------
    similarity : float
        Cosine similarity
    """

    similarity = -1.*(cosine_distance(sa, sb) - 1.)
    return similarity


### Helpers ###

def _sparse_to_coo(svect, size=None):
    """
    Convert a sparse feature [(i,v),] to a SciPy COO sparse matrix.
    """
    svect = [ (t,v) for t,v in svect if v != 0 ]
    JK = zip(*svect)
    J = list(JK[0])
    K = list(JK[1])
    I = [0]*len(J)
    
    # Adding a null value with an index of size-1 makes it easier to work with
    #  arrays of different size.
    if size is not None:
        I.append(0)
        J.append(size-1)
        K.append(0)
    coo = coo_matrix((K, (I,J)), shape=(1, max(J)+1))
    return coo

def _sparse_to_array(coo, size=None):
    """
    Convert a sparse feature [(i,v),] to a vector array.
    """
    dense = _sparse_to_coo(coo, size=size).todense()
    l = dense.shape[1]
    return dense.reshape((l,))

def _shared_features(adense, bdense):
    """
    Number of features in ``adense`` that are also in ``bdense``.
    """
    a_indices = set(numpy.nonzero(adense)[0])
    b_indices = set(numpy.nonzero(bdense)[0])  
    
    shared = list(a_indices & b_indices)
    diff = list(a_indices - b_indices) 
    Ndiff = len(diff) 
    
    return Ndiff

def _smoothing_parameters(aprob, bprob, Ndiff):
    min_a = numpy.min(aprob[numpy.nonzero(aprob)])
    sum_a = numpy.sum(aprob)    
    min_b = numpy.min(bprob[numpy.nonzero(bprob)])
    sum_b = numpy.sum(bprob)
    
    epsilon = min((min_a/sum_a), (min_b/sum_b)) * 0.001
    gamma = 1 - Ndiff * epsilon   
    
    return gamma, epsilon

def _smooth(aprob, bprob, Ndiff):
    """
    Smooth distributions for KL-divergence according to `Bigi 2003
    <http://link.springer.com/chapter/10.1007%2F3-540-36618-0_22?LI=true>`_.
    """
    gamma, epsilon = _smoothing_parameters(aprob, bprob, Ndiff)

    # Remove zeros.
    in_a = numpy.nonzero(aprob)  # Use these indices only.
    aprob = aprob[in_a]
    bprob = bprob[in_a]*gamma
    
    # Replace zero values with epsilon.
    numpy.place(bprob, bprob == 0., (epsilon,))
    
    return aprob, bprob