SciPy

Source code for tethne.networks.topics

"""
Build networks from topics in a topic model.

The current implementation assumes that you are using a :class:`.LDAModel`\.
"""

import logging
logging.basicConfig(filename=None, format='%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

import networkx
from math import log

import sys
if sys.version_info[0] > 2:
    xrange = range

from tethne.analyze import features
from tethne.networks.base import cooccurrence, coupling
from tethne.utilities import argsort


[docs]def terms(model, threshold=0.01, **kwargs): """ Two terms are coupled if the posterior probability for both terms is greather than ``threshold`` for the same topic. Parameters ---------- model : :class:`.LDAModel` threshold : float Default: 0.01 kwargs : kwargs Passed on to :func:`.cooccurrence`\. Returns ------- :ref:`networkx.Graph <networkx:graph>` """ select = lambda f, v, c, dc: v > threshold graph = cooccurrence(model.phi, filter=select, **kwargs) # Only include labels for terms that are actually in the graph. label_map = {k: v for k, v in model.vocabulary.items() if k in graph.nodes()} graph.name = '' return networkx.relabel_nodes(graph, label_map)
[docs]def topic_coupling(model, threshold=None, **kwargs): """ Two papers are coupled if they both contain a shared topic above a ``threshold``. Parameters ---------- model : :class:`.LDAModel` threshold : float Default: ``3./model.Z`` kwargs : kwargs Passed on to :func:`.coupling`\. Returns ------- :ref:`networkx.Graph <networkx:graph>` """ if not threshold: threshold = 3./model.Z select = lambda f, v, c, dc: v > threshold graph = coupling(model.corpus, 'topics', filter=select, **kwargs) graph.name = '' return graph
[docs]def cotopics(model, threshold=None, **kwargs): """ Two topics are coupled if they occur (above some ``threshold``) in the same document (s). Parameters ---------- model : :class:`.LDAModel` threshold : float Default: ``2./model.Z`` kwargs : kwargs Passed on to :func:`.cooccurrence`\. Returns ------- :ref:`networkx.Graph <networkx:graph>` """ if not threshold: threshold = 2./model.Z select = lambda f, v, c, dc: v > threshold return cooccurrence(model.corpus, 'topics', filter=select, **kwargs)
[docs]def distance(model, method='cosine', percentile=90, bidirectional=False, normalize=True, smooth=False, transform='log', **kwargs): """ Generate a network of :class:`.Paper`\s based on a distance metric from `scipy.spatial.distance <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ using :ref:`sparse-feature-vector`\s over the dimensions in ``model``. The only two methods that will not work in this context are ``hamming`` and ``jaccard``. Distances are inverted to a similarity metric, which is log-transformed by default (see ``transform`` parameter, below). Edges are included if they are at or above the ``percentile``th percentile. Parameters ---------- model : :class:`.LDAModel` or :class:`.DTMModel` :func:`.distance` uses ``model.item`` and ``model.metadata``. method : str Name of a distance method from `scipy.spatial.distance <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_. See :func:`.analyze.features.distance` for a list of distance statistics. ``hamming`` or ``jaccard`` will raise a RuntimeError. :func:`.analyze.features.kl_divergence` is also available as 'kl_divergence'. percentile : int (default: 90) Edges are included if they are at or above the ``percentile`` for all distances in the ``model``. bidirectional : bool (default: False) If True, ``method`` is calculated twice for each pair of :class:`.Paper`\s ( ``(i,j)`` and ``(j,i)`` ), and the mean is used. normalize : bool (default: True) If True, vectors over topics are normalized so that they sum to 1.0 for each :class:`.Paper`. smooth : bool (default: False) If True, vectors over topics are smoothed according to `Bigi 2003 <http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_. This may be useful if vectors over topics are very sparse. transform : str (default: 'log') Transformation to apply to similarity values before building the graph. So far only 'log' and None are supported. Returns ------- :ref:`networkx.Graph <networkx:graph>` Similarity values are included as edge weights. Node attributes are set using the fields in ``model.metadata``. See :meth:`networkx.Graph.__init__` Examples -------- .. code-block:: python >>> from tethne.networks import topics >>> thegraph = topics.distance(myLDAModel, 'cosine') >>> import tethne.writers as wr >>> wr.to_graphml(thegraph, '~./thegraph.graphml') .. figure:: _static/images/lda_cosine_network.png :width: 80% Edge weight and opacity indicate similarity. Node color indicates the journal in which each :class:`.Paper` was published. In this graph, papers published in the same journal tend to cluster together. """ if method in ['hamming','jaccard']: raise RuntimeError( 'There is no sensicle interpretation of {0} for these data.' .format(method)) thegraph = networkx.Graph() edges = {} for i in xrange(model.M): for j in xrange(i+1, model.M): if method == 'kl_divergence': # Not a SciPy method. dist = features.kl_divergence( model.item(i), model.item(j) ) dist_ = features.kl_divergence( model.item(j), model.item(i) ) dist = (dist + dist_)/2. else: dist = features.distance( model.item(i), model.item(j), method, normalize=normalize, smooth=smooth ) if bidirectional: dist_ = features.distance( model.item(j), model.item(i), method, normalize=normalize, smooth=smooth ) dist = (dist + dist_)/2. sim = 1./dist if transform == 'log': sim = log(sim) edges[(i,j)] = sim # pct = numpy.percentile(edges.values(), percentile) pct = int(round(len(edges)*(percentile/100.))) for i in argsort(edges.values())[::-1][:pct]: edge, sim = edges.keys()[i], edges.values()[i] thegraph.add_edge(edge[0], edge[1], weight=float(sim)) for key in model.metadata[0].keys(): values = { k:v[key] for k,v in model.metadata.items() if k in thegraph.nodes() } networkx.set_node_attributes(thegraph, key, values) return thegraph