# Source code for tethne.networks.topics

"""
Build networks from topics in a topic model.
"""

import logging
# Import-time side effect: set the record format on the root logger.
# ``filename=None`` means no log file is used; ``basicConfig`` does nothing if
# the root logger already has handlers configured.
logging.basicConfig(filename=None, format='%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Emit INFO and above from this module's logger.
logger.setLevel('INFO')

import networkx
#from scipy import stats
import numpy

from ..analyze import features

def _pair_distance(model, i, j, method, bidirectional, normalize, smooth):
    """
    Return the distance between items ``i`` and ``j`` of ``model``.

    For ``'kl_divergence'`` the mean of both directions is always returned and
    ``bidirectional``, ``normalize`` and ``smooth`` are not used. For any other
    ``method`` the value comes from :func:`.analyze.features.distance`, averaged
    over both directions only when ``bidirectional`` is True.
    """

    if method == 'kl_divergence':   # Not a SciPy method.
        forward = features.kl_divergence(model.item(i), model.item(j))
        backward = features.kl_divergence(model.item(j), model.item(i))
        # Already the mean of (i,j) and (j,i); a further ``bidirectional`` pass
        # through features.distance() must not be applied on top of this.
        return (forward + backward)/2.

    dist = features.distance(model.item(i), model.item(j), method,
                             normalize=normalize, smooth=smooth)

    if bidirectional:
        reverse = features.distance(model.item(j), model.item(i), method,
                                    normalize=normalize, smooth=smooth)
        dist = (dist + reverse)/2.

    return dist


def distance(model, method='cosine', percentile=90, bidirectional=False,
             normalize=True, smooth=False, transform='log'):
    r"""
    Generate a network of :class:`.Paper`\s based on a distance metric from
    `scipy.spatial.distance 
    <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_
    using :ref:`sparse-feature-vector`\s over the dimensions in ``model``.
    
    Refer to the documentation for :func:`.analyze.features.distance` for
    a list of distance statistics. The only two methods that will not work
    in this context are ``hamming`` and ``jaccard``.

    Distances are inverted to a similarity metric, which is log-transformed by 
    default (see ``transform`` parameter, below). Edges are included if they are
    at or above the ``percentile``th percentile.
    
    Parameters
    ----------
    model : :class:`.LDAModel` or :class:`.DTMModel`
        :func:`.distance` uses ``model.M``, ``model.item`` and
        ``model.metadata``.
    method : str
        Name of a distance method from `scipy.spatial.distance
        <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_.
        See :func:`.analyze.features.distance` for a list of distance
        statistics. ``hamming`` or ``jaccard`` will raise a RuntimeError.
        :func:`.analyze.features.kl_divergence` is also available as
        'kl_divergence'; it is always computed in both directions and
        averaged, and ``normalize`` and ``smooth`` are not passed to it.
    percentile : int
        (default: 90) Edges are included if they are at or above the 
        ``percentile`` for all distances in the ``model``.
    bidirectional : bool
        (default: False) If True, ``method`` is calculated twice for each pair
        of :class:`.Paper`\s ( ``(i,j)`` and ``(j,i)`` ), and the mean is used.
        Has no additional effect for 'kl_divergence', which is always
        averaged over both directions.
    normalize : bool
        (default: True) If True, vectors over topics are normalized so that they
        sum to 1.0 for each :class:`.Paper`.
    smooth : bool
        (default: False) If True, vectors over topics are smoothed according to 
        `Bigi 2003 
        <http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_.
        This may be useful if vectors over topics are very sparse.
    transform : str
        (default: 'log') Transformation to apply to similarity values before
        building the graph. So far only 'log' and None are supported; any
        other value leaves the similarity untransformed.
        
    Returns
    -------
    thegraph : networkx.Graph
        Similarity values are included as edge weights. Node attributes are set
        using the fields in ``model.metadata``. If ``model`` has fewer than two
        items there are no pairs to compare and an empty graph is returned.

    Raises
    ------
    RuntimeError
        If ``method`` is ``hamming`` or ``jaccard``.

    Notes
    -----
    Similarity is ``1/distance``, so a pair whose distance is exactly zero is
    not handled specially here.
        
    Examples
    --------
    
    .. code-block:: python
    
       >>> from tethne.networks import topics
       >>> thegraph = topics.distance(MyLDAModel, 'cosine')

       >>> from tethne.writers import graph
       >>> graph.to_graphml(thegraph, '~./thegraph.graphml')
       
    .. figure:: _static/images/lda_cosine_network.png
       :width: 80%
       
       Edge weight and opacity indicate similarity. Node color indicates the
       journal in which each :class:`.Paper` was published. In this graph,
       papers published in the same journal tend to cluster together.

    """
    
    if method in ['hamming','jaccard']:
        raise RuntimeError(
            'There is no sensicle interpretation of {0} for these data.'
                                                            .format(method))
    
    thegraph = networkx.Graph()

    # Parallel lists: pairs[n] is the (i, j) index pair whose similarity is
    # sims[n]. Only the upper triangle (i < j) is evaluated.
    pairs = []
    sims = []
    for i in range(model.M):
        for j in range(i+1, model.M):
            dist = _pair_distance(model, i, j, method, bidirectional,
                                  normalize, smooth)

            sim = 1./dist   # Invert distance to similarity.

            if transform == 'log':
                sim = numpy.log(sim)

            pairs.append((i, j))
            sims.append(sim)

    # Fewer than two items: nothing to threshold, and numpy.percentile cannot
    # operate on an empty sequence.
    if not pairs:
        return thegraph

    # Keep only the edges at or above the requested percentile of similarity.
    pct = numpy.percentile(sims, percentile)
    for (i, j), sim in zip(pairs, sims):
        if sim >= pct:
            thegraph.add_edge(i, j, weight=float(sim))

    # Copy each metadata field onto the nodes that made it into the graph.
    # The field names are taken from the metadata entry for item 0.
    for key in model.metadata[0].keys():
        values = { k:v[key] for k,v in model.metadata.items()
                                if k in thegraph    }
        # Keyword arguments: the positional order of ``name`` and ``values``
        # differs between networkx 1.x and 2.x.
        networkx.set_node_attributes(thegraph, name=key, values=values)

    return thegraph

#def paper_coupling(model, threshold=0.1):
#    """
#    """
#
#    D = model.doc_topic.shape[0]
#    Z = model.doc_topic.shape[1]
#
#    edges = {}
#    for d in xrange(D):
#        d_s = model.doc_topic[d,:]
#        for i in xrange(Z):
#            for j in xrange(i+1, Z):
#                if d_s[i] >= threshold and d_s[j] >= threshold:
#                    try:
#                        edges[(i,j)].append( (d, d_s[i]*d_s[j]/2) )
#                    except KeyError:
#                        edges[(i,j)] = [(d, d_s[i]*d_s[j]/2)]
#
#    pc = nx.Graph()
#
#    for e, papers in edges.iteritems():
#        weight = sum( [p[1] for p in papers] ) / D
#        pc.add_edge(e[0], e[1], weight=weight, \
#                    papers=[model.metadata[p[0]] for p in papers])
#
#    for t in pc.nodes():
#        pc.node[t]['words'] = model.top_keys[t][1]  # Add list of top words.
#
#    return pc
#
#def term_coupling(model, threshold=0.01):
#    """
#    """
#
#    Z = model.top_word.shape[0]
#    W = model.top_word.shape[1]
#
#    edges = {}
#    for w in xrange(W):
#        t_sub = []
#
#        for z in xrange(Z):
#            if model.top_word[z,w] >= threshold:
#                t_sub.append(z)
#
#        for i in xrange(len(t_sub)):
#            for j in xrange(i+1, len(t_sub)):
#                t_i = t_sub[i]
#                t_j = t_sub[j]
#                p_i = model.top_word[t_i,w]
#                p_j = model.top_word[t_j,w]
#                try:
#                    edges[(t_i,t_j)].append((w, (p_i+p_j)/2))
#                except KeyError:
#                    edges[(t_i,t_j)] = [(w, (p_i+p_j)/2)]
#    tc = nx.Graph()
#
#    #print edges
#
#    for e, words in edges.iteritems():
#        weight = sum( [ w[1] for w in words ] ) / W
#        word_list = [model.vocabulary[w[0]] for w in words]
#        tc.add_edge(e[0], e[1], weight=weight, words=word_list)
#
#    for t in tc.nodes():
#        tc.node[t]['words'] = model.top_keys[t][1]  # Add list of top words.
#
#    return tc
#    
#def topic_coupling(model, papers=None, threshold=None):
#    """
#    Builds a network of topics using inverse symmetric KL-divergence on papers.
#    
#    If `papers` is not None, uses only those papers provided to calculate
#    KL-divergence.
#    
#    Parameters
#    ----------
#    model : :class:`.LDAModel`
#    papers : list
#        A list of paper indices to use in KL-divergence calculation.
#    threshold : float
#        Minimum inverse symmetric KL-divergence for an edge. (default = 0.25)
#    """
#    
#    Z = model.top_word.shape[0]
#    G = nx.Graph()
#    
#    if threshold is None:
#        # Scaling factor to remove negative correlation between N_d and number 
#        # of edges.
#        threshold = len(papers)**-0.2 + 0.1
#        
#    if papers is None:
#        dt_matrix = model.doc_topic
#    else:
#        N_d = len(papers)
#        dt_matrix = np.zeros((N_d, Z))
#        for d in xrange(N_d):
#            dt_matrix[d, :] = model.doc_topic[papers[d], :]
#
#    for i in xrange(Z):
#        for j in xrange(i+1, Z):
#            D_ij = stats.entropy(dt_matrix[:,i], dt_matrix[:,j])
#            D_ji = stats.entropy(dt_matrix[:,j], dt_matrix[:,i])
#            iD_sym = float(1/(D_ij + D_ji))
#            
#            if iD_sym >= threshold:
#                G.add_node(j, label=', '.join(model.top_keys[i][1]))
#                G.add_edge(i,j,weight=iD_sym)
#    
#    return G