SciPy

Source code for tethne.analyze.corpus

"""
Methods for analyzing :class:`.Corpus` objects.

.. autosummary::
   :nosignatures:

   burstness
   feature_burstness
   sigma

"""

import networkx as nx
import warnings

from math import exp, log
from collections import defaultdict
from itertools import izip

from tethne.utilities import argmin, mean

import sys
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    unicode = str
    xrange = range



def _forward(X, s=1.1, gamma=1., k=5):
    """
    Forward dynamic algorithm for burstness automaton HMM, from `Kleinberg
    (2002) <http://www.cs.cornell.edu/home/kleinber/bhs.pdf>`_.

    Parameters
    ----------
    X : list
        A series of time-gaps between events.
    s : float
        (default: 1.1) Scaling parameter ( > 1.)that controls graininess of
        burst detection. Lower values make the model more sensitive.
    gamma : float
        (default: 1.0) Parameter that controls the 'cost' of higher burst
        states. Higher values make it more 'difficult' to achieve a higher
        burst state.
    k : int
        (default: 5) Number of states. Higher values increase computational
        cost of the algorithm. A maximum of 25 is suggested by the literature.

    Returns
    -------
    states : list
        Optimal state sequence.
    """
    X = list(X)

    def alpha(i):
        return (n/T)*(s**i)

    def tau(i, j):
        if j > i:
            return (j-i)*gamma*log(n)
        return 0.

    def f(j, x):
        return alpha(j) * exp(-1. * alpha(j) * x)

    def C(j, t):
        if j == 0 and t == 0:
            return 0.
        elif t == 0:
            return float("inf")

        C_tau = min([C_values[l][t-1] + tau(l, j) for l in xrange(k)])
        return (-1. * log(f(j,X[t]))) + C_tau

    T = sum(X)
    n = len(X)

    # C() requires default (0) values, so we construct the "array" in advance.
    C_values = [[0 for t in xrange(len(X))] for j in xrange(k)]
    for j in xrange(k):
        for t in xrange(len(X)):
            C_values[j][t] = C(j,t)

    # Find the optimal state sequence.
    states = [argmin([c[t] for c in C_values]) for t in xrange(n)]
    return states

def _top_features(corpus, feature, topn=20, perslice=False, axis='date'):
    warnings.warn("Removed in 0.8. Use corpus.top_features() instead.",
                  DeprecationWarning)
    return corpus.top_features(feature, topn=topn, perslice=perslice)

[docs]def burstness(corpus, featureset_name, features=[], k=5, topn=20, perslice=False, normalize=True, **kwargs): """ Estimate burstness profile for the ``topn`` features (or ``flist``) in ``feature``. Uses the popular burstness automaton model inroduced by `Kleinberg (2002) <http://www.cs.cornell.edu/home/kleinber/bhs.pdf>`_. Parameters ---------- corpus : :class:`.Corpus` feature : str Name of featureset in ``corpus``. E.g. ``'citations'``. k : int (default: 5) Number of burst states. topn : int or float {0.-1.} (default: 20) Number (int) or percentage (float) of top-occurring features to return. If ``flist`` is provided, this parameter is ignored. perslice : bool (default: False) If True, loads ``topn`` features per slice. Otherwise, loads ``topn`` features overall. If ``flist`` is provided, this parameter is ignored. flist : list List of features. If provided, ``topn`` and ``perslice`` are ignored. normalize : bool (default: True) If True, burstness is expressed relative to the hightest possible state (``k-1``). Otherwise, states themselves are returned. kwargs : kwargs Parameters for burstness automaton HMM. Returns ------- B : dict Keys are features, values are tuples of ( dates, burstness ) Examples -------- .. code-block:: python >>> from tethne.analyze.corpus import burstness >>> B = burstness(corpus, 'abstractTerms', flist=['process', 'method'] >>> B['process'] ([1990, 1991, 1992, 1993], [0., 0.4, 0.6, 0.]) """ # If `features` of interest are not specified, calculate burstness for the # top `topn` features. if len(features) == 0: T = corpus.top_features(featureset_name, topn=topn, perslice=perslice) if perslice: features = list(set([f[0] for flist in zip(*T)[1] for f in flist])) else: features = list(zip(*T))[0] B = {feature: feature_burstness(corpus, featureset_name, feature, k=k, normalize=normalize, **kwargs) for feature in features} return B
[docs]def feature_burstness(corpus, featureset_name, feature, k=5, normalize=True, s=1.1, gamma=1., **slice_kwargs): """ Estimate burstness profile for a feature over the ``'date'`` axis. Parameters ---------- corpus : :class:`.Corpus` feature : str Name of featureset in ``corpus``. E.g. ``'citations'``. findex : int Index of ``feature`` in ``corpus``. k : int (default: 5) Number of burst states. normalize : bool (default: True) If True, burstness is expressed relative to the hightest possible state (``k-1``). Otherwise, states themselves are returned. kwargs : kwargs Parameters for burstness automaton HMM. """ if featureset_name not in corpus.features: corpus.index_feature(featureset_name) if 'date' not in corpus.indices: corpus.index('date') # Get time-intervals between occurrences. dates = [min(corpus.indices['date'].keys()) - 1] # Pad start. X_ = [1.] years, values = corpus.feature_distribution(featureset_name, feature) for year, N in izip(years, values): if N == 0: continue if N > 1: if year == dates[-1] + 1: for n in xrange(int(N)): X_.append(1./N) dates.append(year) else: X_.append(float(year - dates[-1])) dates.append(year) for n in xrange(int(N) - 1): X_.append(1./(N - 1)) dates.append(year) else: X_.append(float(year - dates[-1])) dates.append(year) # Get optimum state sequence. st = _forward(map(lambda x: x*100, X_), s=s, gamma=gamma, k=k) # Bin by date. A = defaultdict(list) for i in xrange(len(X_)): A[dates[i]].append(st[i]) # Normalize. if normalize: A = {key: mean(values)/k for key, values in A.items()} else: A = {key: mean(values) for key, values in A.items()} D = sorted(A.keys()) return D[1:], [A[d] for d in D[1:]]
[docs]def sigma(G, corpus, featureset_name, B=None, **kwargs): """ Calculate sigma (from `Chen 2009 <http://arxiv.org/pdf/0904.1439.pdf>`_) for all of the nodes in a :class:`.GraphCollection`\. You can set parameters for burstness estimation using ``kwargs``: ========= =============================================================== Parameter Description ========= =============================================================== s Scaling parameter ( > 1.)that controls graininess of burst detection. Lower values make the model more sensitive. Defaults to 1.1. gamma Parameter that controls the 'cost' of higher burst states. Defaults to 1.0. k Number of burst states. Defaults to 5. ========= =============================================================== Parameters ---------- G : :class:`.GraphCollection` corpus : :class:`.Corpus` feature : str Name of a featureset in `corpus`. Examples -------- Assuming that you have a :class:`.Corpus` generated from WoS data that has been sliced by ``date``. .. code-block:: python >>> # Generate a co-citation graph collection. >>> from tethne import GraphCollection >>> kwargs = { 'threshold':2, 'topn':100 } >>> G = GraphCollection() >>> G.build(corpus, 'date', 'papers', 'cocitation', method_kwargs=kwargs) >>> # Calculate sigma. This may take several minutes, depending on the >>> # size of your co-citaiton graph collection. >>> from tethne.analyze.corpus import sigma >>> G = sigma(G, corpus, 'citations') >>> # Visualize... >>> from tethne.writers import collection >>> collection.to_dxgmml(G, '~/cocitation.xgmml') In the visualization below, node and label sizes are mapped to ``sigma``, and border width is mapped to ``citations``. .. figure:: _static/images/cocitation_sigma2.png :width: 600 :align: center """ if 'date' not in corpus.indices: corpus.index('date') # Calculate burstness if not provided. if not B: B = burstness(corpus, featureset_name, features=G.nodes(), **kwargs) Sigma = {} # Keys are dates (from GraphCollection), values are # node:sigma dicts. for key, graph in G.iteritems(): centrality = nx.betweenness_centrality(graph) sigma = {} # Sigma values for all features in this year. attrs = {} # Sigma values for only those features in this graph. for n_, burst in B.iteritems(): burst = dict(list(zip(*burst))) # Reorganize for easier lookup. # Nodes are indexed as integers in the GraphCollection. n = G.node_lookup[n_] # We have burstness values for years in which the feature ``n`` # occurs, and we have centrality values for years in which ``n`` # made it into the graph. if n in graph.nodes() and key in burst: sigma[n] = ((centrality[n] + 1.) ** burst[key]) - 1. attrs[n] = sigma[n] # Update graph with sigma values. nx.set_node_attributes(graph, 'sigma', attrs) Sigma[key] = sigma # Invert results and update the GraphCollection.master_graph. # TODO: is there a more efficient way to do this? inverse = defaultdict(dict) for gname, result in Sigma.iteritems(): if hasattr(result, '__iter__'): for n, val in result.iteritems(): inverse[n].update({gname: val}) nx.set_node_attributes(G.master_graph, 'sigma', inverse) # We want to return results in the same format as burstness(); with node # labels as keys; values are tuples ([years...], [sigma...]). return {n: list(zip(*G.node_history(G.node_lookup[n], 'sigma').items())) for n in B.keys()}