# Source code for tethne.networks.topics
"""
Build networks from topics in a topic model.
"""
# Logging is configured at import time: ``basicConfig`` is called with
# ``filename=None`` and a verbose record format, then this module's own logger
# is created and set to the INFO level.
# NOTE(review): calling ``logging.basicConfig`` from library code is a
# module-level side effect on the importing application's logging setup --
# confirm that this is intended.
import logging
logging.basicConfig(filename=None, format='%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel('INFO')
# Third-party dependencies: ``networkx`` builds the graph and sets node
# attributes; ``numpy`` provides ``log`` and ``percentile``.
import networkx
#from scipy import stats
import numpy
# Project-local helpers: ``features.distance`` and ``features.kl_divergence``
# are the pairwise metrics called by :func:`distance`.
from ..analyze import features
def distance(model, method='cosine', percentile=90, bidirectional=False,
             normalize=True, smooth=False, transform='log'):
    r"""
    Generate a network of :class:`.Paper`\s based on a distance metric from
    `scipy.spatial.distance
    <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_
    using :ref:`sparse-feature-vector`\s over the dimensions in ``model``.

    Refer to the documentation for :func:`.analyze.features.distance` for
    a list of distance statistics. The only two methods that will not work
    in this context are ``hamming`` and ``jaccard``.

    Distances are inverted to a similarity metric, which is log-transformed by
    default (see ``transform`` parameter, below). Edges are included if their
    similarity is at or above the ``percentile``th percentile of all pairwise
    similarity values.

    Parameters
    ----------
    model : :class:`.LDAModel` or :class:`.DTMModel`
        :func:`.distance` uses ``model.M``, ``model.item`` and
        ``model.metadata``.
    method : str
        Name of a distance method from `scipy.spatial.distance
        <http://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_.
        See :func:`.analyze.features.distance` for a list of distance
        statistics. ``hamming`` or ``jaccard`` will raise a RuntimeError.
        :func:`.analyze.features.kl_divergence` is also available as
        'kl_divergence'; it is always computed in both directions and
        averaged, and ``normalize`` and ``smooth`` are not passed to it.
    percentile : int
        (default: 90) Edges are included if they are at or above the
        ``percentile`` for all similarity values in the ``model``.
    bidirectional : bool
        (default: False) If True, ``method`` is calculated twice for each pair
        of :class:`.Paper`\s ( ``(i,j)`` and ``(j,i)`` ), and the mean is used.
    normalize : bool
        (default: True) If True, vectors over topics are normalized so that they
        sum to 1.0 for each :class:`.Paper`.
    smooth : bool
        (default: False) If True, vectors over topics are smoothed according to
        `Bigi 2003
        <http://lvk.cs.msu.su/~bruzz/articles/classification/Using%20Kullback-Leibler%20Distance%20for%20Text%20Categorization.pdf>`_.
        This may be useful if vectors over topics are very sparse.
    transform : str
        (default: 'log') Transformation to apply to similarity values before
        building the graph. So far only 'log' and None are supported; any
        other value leaves the similarity untransformed.

    Returns
    -------
    thegraph : networkx.Graph
        Similarity values are included as edge weights. Node attributes are set
        using the fields in ``model.metadata``. Only items that take part in
        at least one retained edge appear as nodes. If the model has fewer
        than two items, an empty graph is returned.

    Raises
    ------
    RuntimeError
        If ``method`` is ``hamming`` or ``jaccard``.

    Examples
    --------

    .. code-block:: python

       >>> from tethne.networks import topics
       >>> thegraph = topics.distance(MyLDAModel, 'cosine')

       >>> from tethne.writers import graph
       >>> graph.to_graphml(thegraph, '~./thegraph.graphml')

    .. figure:: _static/images/lda_cosine_network.png
       :width: 80%

       Edge weight and opacity indicate similarity. Node color indicates the
       journal in which each :class:`.Paper` was published. In this graph,
       papers published in the same journal tend to cluster together.
    """

    if method in ['hamming', 'jaccard']:
        raise RuntimeError(
            'There is no sensicle interpretation of {0} for these data.'
            .format(method))

    thegraph = networkx.Graph()

    # Similarity for every unordered pair (i, j) with i < j.
    edges = {}
    for i in range(model.M):
        for j in range(i + 1, model.M):
            if method == 'kl_divergence':   # Not a SciPy method.
                # KL divergence is asymmetric, so always use the mean of both
                # directions.
                dist = features.kl_divergence(model.item(i), model.item(j))
                dist_ = features.kl_divergence(model.item(j), model.item(i))
                dist = (dist + dist_)/2.
            else:
                dist = features.distance(model.item(i), model.item(j), method,
                                         normalize=normalize, smooth=smooth)

                if bidirectional:
                    dist_ = features.distance(
                                model.item(j), model.item(i), method,
                                normalize=normalize, smooth=smooth)
                    dist = (dist + dist_)/2.

            # NOTE(review): a distance of exactly 0 (identical vectors) raises
            # ZeroDivisionError for a Python float and yields ``inf`` for a
            # NumPy float; that behavior is left unchanged here.
            sim = 1./dist
            if transform == 'log':
                sim = numpy.log(sim)

            edges[(i, j)] = sim

    # With fewer than two items there are no pairs, and a percentile of an
    # empty sequence is not meaningful.
    if not edges:
        return thegraph

    # ``list(...)`` so that NumPy receives a sequence on Python 3 as well,
    # where ``dict.values()`` is a view.
    pct = numpy.percentile(list(edges.values()), percentile)
    for (i, j), sim in edges.items():
        if sim >= pct:
            thegraph.add_edge(i, j, weight=float(sim))

    # Restrict metadata to nodes that made it into the graph, once, using the
    # graph's own O(1) membership test.
    node_metadata = {k: v for k, v in model.metadata.items() if k in thegraph}

    # The attribute names are taken from the metadata of item 0.
    for key in model.metadata[0].keys():
        values = {k: v[key] for k, v in node_metadata.items()}
        # Keyword arguments keep this call valid for both the networkx 1.x
        # ``(G, name, values)`` and 2.x ``(G, values, name)`` signatures.
        networkx.set_node_attributes(thegraph, name=key, values=values)

    return thegraph
#def paper_coupling(model, threshold=0.1):
# """
# """
#
# D = model.doc_topic.shape[0]
# Z = model.doc_topic.shape[1]
#
# edges = {}
# for d in xrange(D):
# d_s = model.doc_topic[d,:]
# for i in xrange(Z):
# for j in xrange(i+1, Z):
# if d_s[i] >= threshold and d_s[j] >= threshold:
# try:
# edges[(i,j)].append( (d, d_s[i]*d_s[j]/2) )
# except KeyError:
# edges[(i,j)] = [(d, d_s[i]*d_s[j]/2)]
#
# pc = nx.Graph()
#
# for e, papers in edges.iteritems():
# weight = sum( [p[1] for p in papers] ) / D
# pc.add_edge(e[0], e[1], weight=weight, \
# papers=[model.metadata[p[0]] for p in papers])
#
# for t in pc.nodes():
# pc.node[t]['words'] = model.top_keys[t][1] # Add list of top words.
#
# return pc
#
#def term_coupling(model, threshold=0.01):
# """
# """
#
# Z = model.top_word.shape[0]
# W = model.top_word.shape[1]
#
# edges = {}
# for w in xrange(W):
# t_sub = []
#
# for z in xrange(Z):
# if model.top_word[z,w] >= threshold:
# t_sub.append(z)
#
# for i in xrange(len(t_sub)):
# for j in xrange(i+1, len(t_sub)):
# t_i = t_sub[i]
# t_j = t_sub[j]
# p_i = model.top_word[t_i,w]
# p_j = model.top_word[t_j,w]
# try:
# edges[(t_i,t_j)].append((w, (p_i+p_j)/2))
# except KeyError:
# edges[(t_i,t_j)] = [(w, (p_i+p_j)/2)]
# tc = nx.Graph()
#
# #print edges
#
# for e, words in edges.iteritems():
# weight = sum( [ w[1] for w in words ] ) / W
# word_list = [model.vocabulary[w[0]] for w in words]
# tc.add_edge(e[0], e[1], weight=weight, words=word_list)
#
# for t in tc.nodes():
# tc.node[t]['words'] = model.top_keys[t][1] # Add list of top words.
#
# return tc
#
#def topic_coupling(model, papers=None, threshold=None):
# """
# Builds a network of topics using inverse symmetric KL-divergence on papers.
#
# If `papers` is not None, uses only those papers provided to calculate
# KL-divergence.
#
# Parameters
# ----------
# model : :class:`.LDAModel`
# papers : list
# A list of paper indices to use in KL-divergence calculation.
# threshold : float
#        Minimum inverse symmetric KL-divergence for an edge. If None (default),
#        ``len(papers)**-0.2 + 0.1`` is used.
# """
#
# Z = model.top_word.shape[0]
# G = nx.Graph()
#
# if threshold is None:
# # Scaling factor to remove negative correlation between N_d and number
# # of edges.
# threshold = len(papers)**-0.2 + 0.1
#
# if papers is None:
# dt_matrix = model.doc_topic
# else:
# N_d = len(papers)
# dt_matrix = np.zeros((N_d, Z))
# for d in xrange(N_d):
# dt_matrix[d, :] = model.doc_topic[papers[d], :]
#
# for i in xrange(Z):
# for j in xrange(i+1, Z):
# D_ij = stats.entropy(dt_matrix[:,i], dt_matrix[:,j])
# D_ji = stats.entropy(dt_matrix[:,j], dt_matrix[:,i])
# iD_sym = float(1/(D_ij + D_ji))
#
# if iD_sym >= threshold:
# G.add_node(j, label=', '.join(model.top_keys[i][1]))
# G.add_edge(i,j,weight=iD_sym)
#
# return G