SciPy

Source code for tethne.networks.features

"""
Methods for building networks from terms in bibliographic records. This
includes keywords, abstract terms, etc.

.. autosummary::
   :nosignatures:

   cooccurrence
   feature_cooccurrence
   mutual_information
   keyword_cooccurrence
   topic_coupling

"""

from math import log
import networkx as nx
import warnings

from tethne.networks.base import cooccurrence, coupling, multipartite

def _nPMI(p_ij, p_i, p_j):
    """
    Calculate normalized pointwise mutual information (nPMI) for two features.

    The value is ``log(p_ij / (p_i * p_j)) / (-log(p_ij))``.

    Parameters
    ----------
    p_ij : float
        Probability that both features occur in the same document.
    p_i : float
        Probability that the first feature occurs in a document.
    p_j : float
        Probability that the second feature occurs in a document.

    Returns
    -------
    float
        ``-1.`` when ``p_ij`` is zero but ``p_i * p_j`` is not (the limiting
        value of nPMI for features that never co-occur); ``0.`` when the
        expression is undefined (``p_i * p_j == 0`` or ``log(p_ij) == 0``);
        otherwise the nPMI value.
    """
    joint = p_i * p_j

    # log(0) raises ValueError, so the zero-probability case must be handled
    # before any logarithm is taken.
    if p_ij == 0.:
        return 0. if joint == 0. else -1.

    lower = -1. * log(p_ij)
    if lower == 0. or joint == 0.:
        return 0.
    return log(p_ij / joint) / lower


def feature_cooccurrence(corpus, featureset_name, min_weight=1,
                         filter=lambda f, v, c, dc: True):
    """
    Build a co-occurrence graph for the features in a featureset.

    This is a thin wrapper that forwards all of its arguments to
    :func:`tethne.networks.base.cooccurrence`.

    Parameters
    ----------
    corpus
        Corpus object handed to ``cooccurrence``.
    featureset_name : str
        Name of the featureset handed to ``cooccurrence``.
    min_weight : int
        Forwarded as ``min_weight`` (default: 1).
    filter : callable
        Four-argument callable forwarded as ``filter``. The default accepts
        everything (always returns ``True``).

    Returns
    -------
    Whatever ``cooccurrence`` returns (used elsewhere in this module as a
    NetworkX graph).
    """
    graph = cooccurrence(
        corpus,
        featureset_name,
        min_weight=min_weight,
        filter=filter,
    )
    return graph
def mutual_information(corpus, featureset_name, min_weight=0.9,
                       filter=lambda f, v, c, dc: True):
    """
    Generates a graph of features in ``featureset`` based on normalized
    `pointwise mutual information (nPMI)
    <http://en.wikipedia.org/wiki/Pointwise_mutual_information>`_.

    .. math::

       nPMI(i,j)=\\frac{log(\\frac{p_{ij}}{p_i*p_j})}{-1*log(p_{ij})}

    ...where :math:`p_i` and :math:`p_j` are the probabilities that features
    *i* and *j* will occur in a document (independently), and :math:`p_{ij}`
    is the probability that those two features will occur in the same
    document.

    Parameters
    ----------
    corpus
        Corpus object. Must expose ``papers`` (sized) and
        ``features[featureset_name]``, whose ``lookup`` maps a feature to the
        key used in its ``documentCounts``.
    featureset_name : str
        Key of the featureset in ``corpus.features``.
    min_weight : float
        Minimum nPMI an edge must reach to be kept (default: 0.9).
    filter : callable
        Forwarded unchanged to :func:`feature_cooccurrence`.

    Returns
    -------
    graph
        A new graph of the same type as the co-occurrence graph. It contains
        only the edges whose nPMI is at least ``min_weight``; each edge keeps
        its original attributes plus ``nPMI``, and each retained node keeps
        its original attributes.
    """
    # Start from every co-occurring pair (min_weight=1); thresholding is done
    # on nPMI below, not on the raw co-occurrence count.
    graph = feature_cooccurrence(corpus, featureset_name, min_weight=1,
                                 filter=filter)
    mgraph = type(graph)()
    fset = corpus.features[featureset_name]

    # Loop-invariant: number of documents, as a float so that the divisions
    # below are true divisions on Python 2 as well.
    n_papers = float(len(corpus.papers))

    for s, t, attrs in graph.edges(data=True):
        p_ij = float(attrs['weight']) / n_papers
        p_i = float(fset.documentCounts[fset.lookup[s]]) / n_papers
        p_j = float(fset.documentCounts[fset.lookup[t]]) / n_papers
        MI = _nPMI(p_ij, p_i, p_j)
        if MI >= min_weight:
            mgraph.add_edge(s, t, nPMI=MI, **attrs)

    # Retain node attributes. ``nodes(data=True)`` is used instead of the
    # ``Graph.node`` mapping because the latter was removed in NetworkX 2.4;
    # the yielded dicts are the graph's own attribute dicts, so updating them
    # in place updates ``mgraph``.
    source_attrs = {n: data for n, data in graph.nodes(data=True)}
    for n, data in mgraph.nodes(data=True):
        data.update(source_attrs[n])

    return mgraph
def keyword_cooccurrence(corpus, min_weight=1,
                         filter=lambda f, v, c, dc: True):
    """
    Deprecated: build a co-occurrence graph from the ``'authorKeywords'``
    featureset.

    Emits a :class:`DeprecationWarning` and then delegates to
    :func:`feature_cooccurrence` with ``featureset_name='authorKeywords'``.

    Parameters
    ----------
    corpus
        Corpus object forwarded to ``feature_cooccurrence``.
    min_weight : int
        Forwarded as ``min_weight`` (default: 1).
    filter : callable
        Forwarded as ``filter``. The default always returns ``True``.

    Returns
    -------
    The result of ``feature_cooccurrence``.
    """
    # stacklevel=2 attributes the warning to the caller's line rather than to
    # this module, so Python's default warning filters can surface it.
    warnings.warn('keyword_cooccurrence will be removed in v0.8. Use ' +
                  'feature_cooccurrence with "authorKeywords" or ' +
                  '"keywordsPlus" instead.', DeprecationWarning,
                  stacklevel=2)
    return feature_cooccurrence(corpus, 'authorKeywords',
                                min_weight=min_weight, filter=filter)