# Source code for tethne.networks.features

"""
Methods for building networks from terms in bibliographic records. This
includes keywords, abstract terms, etc.

.. autosummary::
   :nosignatures:

   cooccurrence
   mutual_information
   keyword_cooccurrence
   topic_coupling

"""

from math import log
import networkx as nx
import warnings

from tethne.networks.base import cooccurrence, coupling, multipartite

def _nPMI(p_ij, p_i, p_j):
    """
    Calculate normalized pointwise mutual information (nPMI).

    Parameters
    ----------
    p_ij : float
        Probability that both features occur in the same document.
    p_i : float
        Probability that the first feature occurs in a document.
    p_j : float
        Probability that the second feature occurs in a document.

    Returns
    -------
    float
        ``log(p_ij / (p_i * p_j)) / (-1 * log(p_ij))``. Degenerate inputs are
        handled explicitly: ``0.`` is returned when ``p_i * p_j`` is zero or
        when ``p_ij`` is 1 (the denominator would be zero), and ``-1.`` (the
        limiting value of nPMI) is returned when ``p_ij`` is zero while both
        marginal probabilities are non-zero.

    Raises
    ------
    ValueError
        If ``p_ij`` is negative (raised by :func:`math.log`).
    """
    if p_ij == 0.:
        # log(0) is undefined. Features that never co-occur have nPMI -> -1
        # in the limit; if a marginal is also zero there is no evidence at
        # all, so fall back to 0 like the other degenerate cases below.
        return 0. if p_i * p_j == 0. else -1.

    lower = -1. * log(p_ij)
    joint = p_i * p_j
    if lower == 0. or joint == 0.:
        return 0.
    return log(p_ij / joint) / lower


def feature_cooccurrence(corpus, featureset_name, min_weight=1,
                         filter=lambda f, v, c, dc: True):
    """
    Build a co-occurrence graph for the features in one featureset.

    This is a thin wrapper: all arguments are forwarded unchanged to
    :func:`tethne.networks.base.cooccurrence`.

    Parameters
    ----------
    corpus
        The corpus to build the graph from; passed through as-is.
    featureset_name : str
        Name of the featureset whose features become nodes.
    min_weight : int
        Forwarded as ``min_weight`` (default: 1).
        NOTE(review): presumably the minimum number of co-occurrences needed
        for an edge -- confirm against ``cooccurrence``.
    filter : callable
        Forwarded as ``filter``. The default accepts four positional
        arguments and always returns ``True``.

    Returns
    -------
    The graph produced by ``cooccurrence``.
    """
    return cooccurrence(corpus, featureset_name, min_weight=min_weight,
                        filter=filter)


def mutual_information(corpus, featureset_name, min_weight=0.9,
                       filter=lambda f, v, c, dc: True):
    """
    Generates a graph of features in ``featureset`` based on normalized
    `pointwise mutual information (nPMI)
    <http://en.wikipedia.org/wiki/Pointwise_mutual_information>`_.

    .. math::

       nPMI(i,j)=\\frac{log(\\frac{p_{ij}}{p_i*p_j})}{-1*log(p_{ij})}

    ...where :math:`p_i` and :math:`p_j` are the probabilities that features
    *i* and *j* will occur in a document (independently), and :math:`p_{ij}` is
    the probability that those two features will occur in the same document.

    Parameters
    ----------
    corpus
        Must provide ``papers`` (its length is used as the number of
        documents) and ``features[featureset_name]``.
    featureset_name : str
        Name of the featureset to use. The featureset must provide ``lookup``
        (node -> feature index) and ``documentCounts`` (feature index ->
        number of documents containing the feature).
    min_weight : float
        Minimum nPMI an edge must have to be kept (default: 0.9).
    filter : callable
        Forwarded to :func:`feature_cooccurrence`.

    Returns
    -------
    A new graph of the same type as the co-occurrence graph. It contains only
    the edges whose nPMI is at least ``min_weight``; each edge keeps its
    original attributes plus ``nPMI``, and each retained node keeps its
    original attributes.
    """

    # Start from the full co-occurrence graph (every pair that co-occurs at
    # least once); thresholding happens on nPMI below, not on raw counts.
    graph = feature_cooccurrence(corpus, featureset_name, min_weight=1,
                                 filter=filter)
    mgraph = type(graph)()
    keep_nodes = set()

    fset = corpus.features[featureset_name]
    n_papers = float(len(corpus.papers))    # Loop-invariant denominator.
    for s, t, attrs in graph.edges(data=True):
        p_ij = float(attrs['weight']) / n_papers
        p_i = float(fset.documentCounts[fset.lookup[s]]) / n_papers
        p_j = float(fset.documentCounts[fset.lookup[t]]) / n_papers
        MI = _nPMI(p_ij, p_i, p_j)
        if MI >= min_weight:
            mgraph.add_edge(s, t, nPMI=MI, **attrs)
            keep_nodes.add(s)
            keep_nodes.add(t)

    # Retain node attributes.
    # NOTE(review): ``Graph.node`` was removed in networkx 2.4 in favour of
    # ``Graph.nodes``; kept as-is because the supported networkx version is
    # not visible from this module -- confirm before upgrading.
    for n in keep_nodes:
        mgraph.node[n].update(graph.node[n])

    return mgraph

def keyword_cooccurrence(corpus, min_weight=1, filter=lambda f, v, c, dc: True):
    """
    Deprecated alias for :func:`feature_cooccurrence` on ``'authorKeywords'``.

    Emits a :class:`DeprecationWarning` and then delegates to
    :func:`feature_cooccurrence` with ``featureset_name='authorKeywords'``.

    Parameters
    ----------
    corpus
        Forwarded to :func:`feature_cooccurrence`.
    min_weight : int
        Forwarded to :func:`feature_cooccurrence` (default: 1).
    filter : callable
        Forwarded to :func:`feature_cooccurrence`.

    Returns
    -------
    The graph returned by :func:`feature_cooccurrence`.
    """
    # stacklevel=2 attributes the warning to the caller's line rather than to
    # this module, so users can find the call site that needs updating.
    warnings.warn('keyword_cooccurrence will be removed in v0.8. Use '
                  'feature_cooccurrence with "authorKeywords" or '
                  '"keywordsPlus" instead.', DeprecationWarning,
                  stacklevel=2)
    return feature_cooccurrence(corpus, 'authorKeywords',
                                min_weight=min_weight, filter=filter)