Need help? Have a feature request? Please check out the tethne-users group.
Source code for tethne.networks.topics

"""
Build networks from topics in a topic model.

The current implementation assumes that you are using a :class:`.LDAModel`\.
"""

import logging
# Module-level logging setup. ``basicConfig`` is called with ``filename=None``
# (so no log file is configured here) and a verbose record format that includes
# the timestamp, logger name, level, module, function name and line number.
# NOTE(review): this runs at import time and touches the root logger's
# configuration; confirm that is intended for a library module.
logging.basicConfig(filename=None, format='%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
# Logger named after this module, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

import networkx
from math import log

import sys
# Python 3 has no ``xrange`` builtin; alias it to ``range`` so that code in this
# module can use ``xrange`` on both major versions.
if sys.version_info[0] > 2:
    xrange = range

from tethne.analyze import features
from tethne.networks.base import cooccurrence, coupling
from tethne.utilities import argsort


def terms(model, threshold=0.01, **kwargs):
    r"""
    Build a network of terms that are prominent in the same topic(s).

    Two terms are coupled if the posterior probability for both terms is
    greater than ``threshold`` for the same topic.

    Parameters
    ----------
    model : :class:`.LDAModel`
        ``model.phi`` is handed to :func:`.cooccurrence`\, and
        ``model.vocabulary`` supplies the labels used to rename the nodes.
    threshold : float
        Default: 0.01
    kwargs : kwargs
        Passed on to :func:`.cooccurrence`\.

    Returns
    -------
    :ref:`networkx.Graph <networkx:graph>`
        The co-occurrence graph with its nodes relabeled via
        ``model.vocabulary`` and its ``name`` set to the empty string.

    """

    def select(f, v, c, dc):
        # Only the value ``v`` is inspected; the remaining arguments belong to
        # the filter signature (presumably dictated by cooccurrence -- its
        # definition is not visible here).
        return v > threshold

    graph = cooccurrence(model.phi, filter=select, **kwargs)

    # Only include labels for terms that are actually in the graph. Membership
    # is tested on the graph itself (a hash lookup) rather than on
    # ``graph.nodes()``, which may be a freshly built list scanned once per
    # vocabulary entry.
    label_map = {k: v for k, v in model.vocabulary.items() if k in graph}
    graph.name = ''
    return networkx.relabel_nodes(graph, label_map)


def topic_coupling(model, threshold=None, **kwargs):
    r"""
    Build a network of papers that share prominent topics.

    Two papers are coupled if they both contain a shared topic above a
    ``threshold``.

    Parameters
    ----------
    model : :class:`.LDAModel`
        ``model.corpus`` is handed to :func:`.coupling`\, and ``model.Z`` is
        used to derive the default ``threshold``.
    threshold : float
        Default: ``3./model.Z`` (used only when ``threshold`` is ``None``).
    kwargs : kwargs
        Passed on to :func:`.coupling`\.

    Returns
    -------
    :ref:`networkx.Graph <networkx:graph>`
        The coupling graph, with its ``name`` set to the empty string.

    """
    # Compare against None explicitly so that a caller-supplied threshold of 0
    # is honored instead of being silently replaced by the default.
    if threshold is None:
        threshold = 3./model.Z

    def select(f, v, c, dc):
        # Only the value ``v`` is inspected; the other arguments are part of
        # the filter signature.
        return v > threshold

    graph = coupling(model.corpus, 'topics', filter=select, **kwargs)
    graph.name = ''
    return graph


def cotopics(model, threshold=None, **kwargs):
    r"""
    Build a network of topics that are prominent in the same documents.

    Two topics are coupled if they occur (above some ``threshold``) in the same
    document(s).

    Parameters
    ----------
    model : :class:`.LDAModel`
        ``model.corpus`` is handed to :func:`.cooccurrence`\, and ``model.Z``
        is used to derive the default ``threshold``.
    threshold : float
        Default: ``2./model.Z`` (used only when ``threshold`` is ``None``).
    kwargs : kwargs
        Passed on to :func:`.cooccurrence`\.

    Returns
    -------
    :ref:`networkx.Graph <networkx:graph>`

    """
    # Compare against None explicitly so that a caller-supplied threshold of 0
    # is honored instead of being silently replaced by the default.
    if threshold is None:
        threshold = 2./model.Z

    def select(f, v, c, dc):
        # Only the value ``v`` is inspected; the other arguments are part of
        # the filter signature.
        return v > threshold

    return cooccurrence(model.corpus, 'topics', filter=select, **kwargs)


def distance(model, method='cosine', percentile=90, bidirectional=False,
             normalize=True, smooth=False, transform='log', **kwargs):
    r"""
    Generate a network of :class:`.Paper`\s based on a distance metric from
    `scipy.spatial.distance
    <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_
    using :ref:`sparse-feature-vector`\s over the dimensions in ``model``.

    The only two methods that will not work in this context are ``hamming`` and
    ``jaccard``.

    Distances are inverted to a similarity metric, which is log-transformed by
    default (see ``transform`` parameter, below). Edges are included if they are
    at or above the ``percentile``th percentile of all similarity values.

    Parameters
    ----------
    model : :class:`.LDAModel` or :class:`.DTMModel`
        :func:`.distance` uses ``model.M``, ``model.item`` and
        ``model.metadata``.
    method : str
        Name of a distance method from `scipy.spatial.distance
        <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_.
        See :func:`.analyze.features.distance` for a list of distance
        statistics. ``hamming`` or ``jaccard`` will raise a RuntimeError.
        :func:`.analyze.features.kl_divergence` is also available as
        'kl_divergence'; it is always computed in both directions and averaged.
    percentile : int
        (default: 90) Edges are included if their similarity is at or above
        the ``percentile``th percentile (nearest rank) of all pairwise
        similarities in the ``model``. Values outside 0-100 are clamped.
    bidirectional : bool
        (default: False) If True, ``method`` is calculated twice for each pair
        of :class:`.Paper`\s ( ``(i,j)`` and ``(j,i)`` ), and the mean is used.
    normalize : bool
        (default: True) If True, vectors over topics are normalized so that they
        sum to 1.0 for each :class:`.Paper`.
    smooth : bool
        (default: False) If True, vectors over topics are smoothed according to
        `Bigi 2003
        <http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_.
        This may be useful if vectors over topics are very sparse.
    transform : str
        (default: 'log') Transformation to apply to similarity values before
        building the graph. So far only 'log' and None are supported; any other
        value leaves the similarity untransformed.
    kwargs : kwargs
        Accepted but not used by this function.

    Returns
    -------
    :ref:`networkx.Graph <networkx:graph>`
        Similarity values are included as edge weights. Node attributes are set
        using the fields in ``model.metadata``. See
        :meth:`networkx.Graph.__init__`

    Raises
    ------
    RuntimeError
        If ``method`` is ``hamming`` or ``jaccard``.

    Examples
    --------

    .. code-block:: python

       >>> from tethne.networks import topics
       >>> thegraph = topics.distance(myLDAModel, 'cosine')

       >>> import tethne.writers as wr
       >>> wr.to_graphml(thegraph, '~./thegraph.graphml')

    .. figure:: _static/images/lda_cosine_network.png
       :width: 80%

       Edge weight and opacity indicate similarity. Node color indicates the
       journal in which each :class:`.Paper` was published. In this graph,
       papers published in the same journal tend to cluster together.

    """

    if method in ['hamming','jaccard']:
        raise RuntimeError(
            'There is no sensicle interpretation of {0} for these data.'
                                                            .format(method))

    thegraph = networkx.Graph()

    # Similarity for every unordered pair of items, keyed by (i, j) with i < j.
    edges = {}
    for i in xrange(model.M):
        for j in xrange(i+1, model.M):
            if method == 'kl_divergence':   # Not a SciPy method.
                # KL divergence is asymmetric, so average both directions.
                dist = features.kl_divergence(model.item(i), model.item(j))
                dist_ = features.kl_divergence(model.item(j), model.item(i))
                dist = (dist + dist_)/2.
            else:
                dist = features.distance(model.item(i), model.item(j), method,
                                         normalize=normalize, smooth=smooth)

            # NOTE(review): this also runs when method == 'kl_divergence',
            # passing that name to features.distance -- confirm that helper
            # accepts it.
            if bidirectional:
                dist_ = features.distance(
                            model.item(j), model.item(i), method,
                            normalize=normalize, smooth=smooth)

                dist = (dist + dist_)/2.

            # Identical vectors yield a distance of zero (or marginally below
            # zero through floating-point rounding). Treat those pairs as
            # infinitely similar rather than dividing by zero or taking the
            # log of a negative number.
            if dist <= 0:
                sim = float('inf')
            else:
                sim = 1./dist

            if transform == 'log':
                sim = log(sim)

            edges[(i, j)] = sim

    # Fewer than two items: there is nothing to connect or to label.
    if not edges:
        return thegraph

    # Nearest-rank percentile over all similarities. Every edge whose
    # similarity reaches the cutoff value is kept, so ``percentile=90`` retains
    # roughly the strongest tenth of the edges (plus ties), as documented.
    ranked = sorted(edges.items(), key=lambda pair: pair[1])
    last = len(ranked) - 1
    cutoff_index = int(round(last * (percentile/100.)))
    cutoff_index = max(0, min(last, cutoff_index))
    cutoff = ranked[cutoff_index][1]

    # Add the strongest edges first; stop at the first one below the cutoff.
    for (i, j), sim in reversed(ranked):
        if sim < cutoff:
            break
        thegraph.add_edge(i, j, weight=float(sim))

    # Copy each metadata field onto the nodes that made it into the graph.
    # The field names are taken from the metadata entry with key 0.
    # NOTE(review): (graph, name, values) is the networkx 1.x argument order
    # for set_node_attributes -- confirm against the installed version.
    for key in model.metadata[0].keys():
        values = {k: v[key] for k, v in model.metadata.items()
                  if k in thegraph}
        networkx.set_node_attributes(thegraph, key, values)

    return thegraph