
Source code for tethne.networks.features

"""
Methods for building networks from terms in bibliographic records. This
includes keywords, abstract terms, etc.

.. autosummary::
   :nosignatures:

   cooccurrence
   mutual_information
   keyword_cooccurrence
   topic_coupling
   
"""

import logging
logging.basicConfig(filename=None, format='%(asctime)-6s: %(name)s - %(levelname)s - %(module)s - %(funcName)s - %(lineno)d - %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel('INFO')

import numpy as np
import networkx as nx
from scipy.sparse import coo_matrix
from collections import Counter

def _filter(s, C, DC, N):
    """Default feature filter; see :func:`cooccurrence`, below."""
    if C > 5 and DC > N*0.05 and len(s) > 4:
        return True
    return False
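
# A custom ``filter`` can be passed to :func:`cooccurrence`, below, in place
# of this default. Illustrative sketch only (``corpus`` stands for a
# hypothetical :class:`.Corpus` with a 'unigrams' featureset): keep any
# feature, however short, that occurs in at least ten documents.
#
#   >>> def min_df_filter(s, C, DC, N):
#   ...     return DC >= 10
#   >>> g = cooccurrence(corpus.all_papers(), corpus.features['unigrams'],
#   ...                  filter=min_df_filter)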

def cooccurrence(papers, featureset, filter=_filter, graph=True, threshold=20,
                 indexed_by='doi', **kwargs):
    """
    Generates a cooccurrence graph for features in ``featureset``.

    ``filter`` is a method applied to each feature, used to determine whether
    a feature should be included in the graph **before** co-occurrence values
    are generated. This can cut down on computational expense. ``filter``
    should accept the following parameters:

    ========= ================================================================
    Parameter Description
    ========= ================================================================
    ``s``     Representation of the feature (e.g. a string).
    ``C``     The overall frequency of the feature in the :class:`.Corpus`\.
    ``DC``    The number of documents in which the feature occurs.
    ``N``     Total number of documents in the :class:`.Corpus`\.
    ========= ================================================================

    The default filter is:

    .. code-block:: python

       >>> def _filter(s, C, DC, N):
       ...     if C > 5 and DC > N*0.05 and len(s) > 4:
       ...         return True
       ...     return False

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances.
    featureset : dict
        A featureset from a :class:`.Corpus`\.
    filter : method
        Method applied to each feature; should return True if the feature
        should be included, and False otherwise. See above.
    graph : bool (default: True)
        If False, returns a dictionary of co-occurrence values instead of a
        Graph.
    threshold : int (default: 20)
        Minimum co-occurrence value for inclusion in the Graph. If ``graph``
        is False, this has no effect.
    indexed_by : str (default: 'doi')
        Field in :class:`.Paper` used as indexing values in ``featureset``.

    Returns
    -------
    networkx.Graph or dict
        See ``graph`` parameter, above.
    """
    if filter is None:  # Turns filtering off (returns True for everything).
        logger.debug('Filtering is disabled.')
        def filter(*args, **kwargs):
            return True

    features = featureset['features']
    index = featureset['index']
    counts = featureset['counts']
    dCounts = featureset['documentCounts']

    # Apply the filter to the featureset.
    subset = [f for f, s in index.iteritems()   # Feature indices and strings.
              if filter(s, counts[f], dCounts[f], len(papers))]

    ecounts = Counter()     # { (f_i, f_j) : int(cc) }
    for paper in papers:
        p = paper[indexed_by]   # features is keyed by paper ID.
        try:    # Not all papers have features data.
            fvect = features[p]
        except KeyError:    # Raised when there are no data for that paper.
            continue

        feats, vals = zip(*fvect)
        feats = list(set(feats) & set(subset))  # Only keep filtered features.
        Nfeats = len(feats)

        # Generate the upper half of the co-occurrence matrix.
        for i in xrange(1, Nfeats):
            for e in zip(feats, feats[i:]):
                e_ = tuple(sorted(e))   # Matrix is symmetric.
                ecounts[e_] += 1

    if not graph:
        logger.debug('No graph; return cooccurrence values only.')
        return ecounts

    # Build an undirected graph, with co-occurrence values as weights.
    logger.debug('Building a graph, with co-occurrence values as weights.')
    g = nx.Graph()
    for e, co in ecounts.iteritems():
        if co >= threshold:
            g.add_edge(e[0], e[1], weight=co)

    # Add node labels.
    labels = {k: v for k, v in index.iteritems() if k in g.nodes()}
    nx.set_node_attributes(g, 'label', labels)

    return g
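
# Usage sketch (same hypothetical ``corpus`` as above): passing ``graph=False``
# returns the raw co-occurrence counts as a Counter keyed by sorted
# feature-index pairs, which can be inspected directly.
#
#   >>> ecounts = cooccurrence(corpus.all_papers(),
#   ...                        corpus.features['unigrams'], graph=False)
#   >>> ecounts.most_common(3)   # The three most frequent feature pairs.
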
def mutual_information(papers, featureset, filter=None, threshold=0.5,
                       indexed_by='doi', **kwargs):
    """
    Generates a graph of features in ``featureset`` based on normalized
    `pointwise mutual information (nPMI)
    <http://en.wikipedia.org/wiki/Pointwise_mutual_information>`_.

    .. math::

       nPMI(i,j) = \\frac{\\log\\left(\\frac{p_{ij}}{p_i p_j}\\right)}{-\\log(p_{ij})}

    where :math:`p_i` and :math:`p_j` are the probabilities that features *i*
    and *j* will occur in a document (independently), and :math:`p_{ij}` is
    the probability that those two features will occur in the same document.

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` instances.
    featureset : dict
        A featureset from a :class:`.Corpus`\.
    filter : method
        Method applied to each feature prior to calculating co-occurrence.
        See :func:`.cooccurrence`\.
    threshold : float (default: 0.5)
        Minimum nPMI for inclusion in the graph.
    indexed_by : str (default: 'doi')
        Field in :class:`.Paper` used as indexing values in ``featureset``.

    Returns
    -------
    graph : networkx.Graph

    Examples
    --------
    Using wordcount data from JSTOR Data-for-Research, we can generate an
    nPMI network as follows:

    .. code-block:: python

       >>> from tethne.readers import dfr   # Prep corpus.
       >>> MyCorpus = dfr.read_corpus(datapath+'/dfr', features=['uni'])
       >>> MyCorpus.filter_features('unigrams', 'u_filtered')
       >>> MyCorpus.transform('u_filtered', 'u_tfidf')

       >>> from tethne.networks import features     # Build graph.
       >>> graph = features.mutual_information(MyCorpus.all_papers(), 'u_tfidf')

       >>> from tethne.writers.graph import to_graphml     # Export graph.
       >>> to_graphml(graph, '/path/to/my/graph.graphml')

    Here's a small cluster from a similar graph, visualized in Cytoscape:

    .. figure:: _static/images/nPMI_phosphorus.png
       :width: 400
       :align: center

       Edge weight and opacity indicate nPMI values.
    """
    logger.debug('Build a pointwise mutual information graph.')

    graph = nx.Graph()
    ecounts = cooccurrence(papers, featureset, filter=filter, graph=False,
                           indexed_by=indexed_by)
    logger.debug('Got {0} cooccurrence values'.format(len(ecounts)))

    features = featureset['features']
    index = featureset['index']
    counts = featureset['counts']
    dCounts = featureset['documentCounts']

    for k, v in ecounts.iteritems():
        # Estimate probabilities from document counts, capped at 1.0.
        p_i = min(float(dCounts[k[0]])/float(len(papers)), 1.0)
        p_j = min(float(dCounts[k[1]])/float(len(papers)), 1.0)
        p_ij = min(float(v)/float(len(papers)), 1.0)
        P = _nPMI(p_ij, p_i, p_j)
        if P >= threshold:
            graph.add_edge(k[0], k[1], weight=float(P))
    logger.debug('Added {0} edges to graph.'.format(len(graph.edges())))

    labels = {k: v for k, v in index.iteritems() if k in graph.nodes()}
    nx.set_node_attributes(graph, 'label', labels)
    logger.debug('Added labels to {0} nodes.'.format(len(labels)))

    return graph
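
# Worked example of the probability estimates used above: with 100 papers, a
# feature found in 20 documents and one found in 10 that co-occur in 8 give
# p_i = 0.2, p_j = 0.1, and p_ij = 0.08, so
# nPMI = log(0.08/0.02) / (-log(0.08)) = 1.386/2.526, or about 0.55, just
# above the default threshold of 0.5.
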
def _nPMI(p_ij, p_i, p_j):
    return (np.log(p_ij / (p_i * p_j))) / (-1 * np.log(p_ij))
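
# Sanity checks for _nPMI: independent features (p_ij == p_i * p_j) score
# 0.0, and features that always occur together (p_ij == p_i == p_j) score
# 1.0.
#
#   >>> _nPMI(0.25, 0.5, 0.5)   # Independent: log(1) == 0.
#   0.0
#   >>> _nPMI(0.5, 0.5, 0.5)    # Perfect co-occurrence.
#   1.0
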
def keyword_cooccurrence(papers, threshold, connected=False, **kwargs):
    """
    Generates a keyword cooccurrence network.

    Parameters
    ----------
    papers : list
        A list of :class:`.Paper` objects.
    threshold : int
        Minimum number of occurrences for a keyword pair to appear in graph.
    connected : bool
        If True, returns only the largest connected component.

    Returns
    -------
    k_cooccurrence : networkx.Graph
        A keyword cooccurrence network.

    Notes
    -----
    Not thoroughly tested.

    **TODO**

    * Incorporate this into the featureset framework.
    """
    # Extract keywords from papers, keyed by WoS identifier.
    keywords = {}
    for entry in papers:
        if 'keywords' in entry.keys():
            keywords[entry['wosid']] = entry['keywords']

    # Generate the complete set of keywords in the dataset.
    wordset = set([])
    for entry in papers:
        try:
            for kw in keywords[entry['wosid']]:
                wordset.add(kw)
        except KeyError:    # Not all papers have keywords.
            pass

    # Mapping of integer indices to keywords.
    i = 0
    dictionary = {}
    dictionary_ = {}
    for word in wordset:
        dictionary[word] = i
        dictionary_[i] = word
        i += 1

    cooccurrence = np.zeros((len(wordset), len(wordset)))
    frequencies = np.zeros((len(wordset),))

    for entry in papers:
        if entry['wosid'] in keywords:  # Only papers with keywords.
            for word in keywords[entry['wosid']]:
                frequencies[dictionary[word]] += 1
                for word_ in keywords[entry['wosid']]:
                    i = dictionary[word]
                    j = dictionary[word_]
                    if i != j:
                        cooccurrence[i, j] += 1

    G = nx.Graph()
    for i in xrange(len(wordset)):
        for j in xrange(i, len(wordset)):
            if cooccurrence[i, j] >= threshold and i != j:
                G.add_edge(dictionary_[i], dictionary_[j],
                           weight=int(cooccurrence[i, j]))

    if connected:   # Return only the largest connected component.
        return nx.connected_component_subgraphs(G)[0]
    else:
        return G    # Return the whole graph.
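
# Usage sketch: ``keyword_cooccurrence`` expects papers carrying 'wosid' and
# 'keywords' fields, e.g. Web of Science records; the path below is
# hypothetical.
#
#   >>> from tethne.readers import wos
#   >>> papers = wos.read('/path/to/savedrecs.txt')
#   >>> kg = keyword_cooccurrence(papers, threshold=3, connected=True)
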
def topic_coupling(model, threshold=0.005, **kwargs):
    """
    Creates a network of words connected by implication in common topic(s).

    Parameters
    ----------
    model : :class:`.LDAModel`
    threshold : float
        Minimum P(W|T) for coupling.

    Returns
    -------
    tc : networkx.Graph
        A topic-coupling graph, where nodes are terms.

    Examples
    --------

    .. code-block:: python

       >>> from tethne.networks import features
       >>> g = features.topic_coupling(MyLDAModel, threshold=0.015)

    Here's a similar network, visualized in Cytoscape:

    .. image:: _static/images/mallet/semantic_network.png
       :width: 600
       :align: center

    For details, see :ref:`mallet-tutorial`.
    """
    Z = model.Z
    W = model.W
    logger.debug('topic_coupling for {0} features, {1} topics.'.format(W, Z))
    logger.debug('threshold: {0}'.format(threshold))

    edges = {}
    for z in xrange(Z):
        # Select words with P(W|T) at or above threshold for this topic.
        word_sub = []
        dimension = model.dimension(z, asmatrix=True)
        for w in dimension.nonzero()[1]:
            if dimension[0, w] >= threshold:
                word_sub.append(w)
        logger.debug('topic {0} has {1} words at or above threshold.'.format(
                                                        z, len(word_sub)))

        # Each qualifying pair of words is coupled by this topic; record the
        # topic and the mean P(W|T) of the pair.
        for i in xrange(len(word_sub)):
            for j in xrange(i+1, len(word_sub)):
                w_i = word_sub[i]
                w_j = word_sub[j]
                p_i = dimension[0, w_i]
                p_j = dimension[0, w_j]
                try:
                    edges[(w_i, w_j)].append((z, (p_i+p_j)/2))
                except KeyError:
                    edges[(w_i, w_j)] = [(z, (p_i+p_j)/2)]

    logger.debug('generated {0} edges'.format(len(edges)))

    tc = nx.Graph()
    for e, topics in edges.iteritems():
        weight = sum([t[1] for t in topics]) / Z
        i_id = model.vocabulary[e[0]]
        j_id = model.vocabulary[e[1]]
        tc.add_edge(i_id, j_id, weight=float(weight),
                    topics=[t[0] for t in topics])
    logger.debug('done')

    return tc
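
# Each edge's ``topics`` attribute records which topics coupled that pair of
# terms, so the graph can be filtered after the fact. Sketch, assuming an
# LDAModel ``MyLDAModel`` as in the docstring example above:
#
#   >>> tc = topic_coupling(MyLDAModel, threshold=0.01)
#   >>> via_topic_3 = [(u, v) for u, v, data in tc.edges(data=True)
#   ...                if 3 in data['topics']]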