
# Source code for tethne.analyze.corpus
"""
Methods for analyzing :class:`.Corpus` objects.

.. autosummary::

   burstness
   feature_burstness
   plot_burstness
   plot_sigma
   sigma
"""

import networkx
import numpy
from ..networks.helpers import top_cited
from ..classes import GraphCollection

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

def _forward(X, s=1.1, gamma=1., k=5):
    Forward dynamic algorithm for burstness automaton HMM, from `Kleinberg
    (2002) <>`_.
    X : list
        A series of time-gaps between events.
    s : float
        (default: 1.1) Scaling parameter ( > 1.)that controls graininess of 
        burst detection. Lower values make the model more sensitive.
    gamma : float
        (default: 1.0) Parameter that controls the 'cost' of higher burst 
        states. Higher values make it more 'difficult' to achieve a higher
        burst state.
    k : int
        (default: 5) Number of states. Higher values increase computational
        cost of the algorithm. A maximum of 25 is suggested by the literature.
    states : list
        Optimal state sequence.

    def alpha(i):
        return (n/T)*(s**i)

    def tau(i,j):
        if j > i:
            return (j-i)*gamma*numpy.log(n)
        return 0.

    def f(j,x):
        return alpha(j) * numpy.exp(-1. * alpha(j) * x)

    def C(j,t):
        if j == 0 and t == 0:
            return 0.
        elif t == 0:
            return numpy.inf
        return ( -1.* numpy.log(f(j,X[t])) ) + \
                numpy.min( [ C_values[l,t-1] + tau(l,j) for l in xrange(k) ] )

    T = float(numpy.sum(X))
    n = len(X)
    C_values = numpy.zeros((k,n))
    for j in xrange(k):
        for t in xrange(len(X)):
            C_values[j,t] = C(j,t)
    states = []
    for t in xrange(n):
        state = numpy.argmin(C_values[:,t])

    return states

def _top_features(corpus, feature, topn=20, perslice=False, axis='date'):
    if perslice:
        top = []
        for key, papers in corpus.get_slices(axis).iteritems():
            scounts = corpus.feature_counts(feature, key, axis,
            scvalues = numpy.array(scounts.values())
            top += [scounts.keys()[c] for c in scvalues.argsort()[-topn:][::-1]]
        counts = corpus.features[feature]['counts']
        cvalues = numpy.array(counts.values())
        top = [ counts.keys()[c] for c in cvalues.argsort()[-topn:][::-1] ]
    return top

def plot_burstness(corpus, feature, k=5, topn=20, perslice=False,
                   flist=None, normalize=True, fig=None, **kwargs):
    """
    Generate a figure depicting burstness profiles for ``feature``.

    Parameters
    ----------
    corpus : :class:`.Corpus`
    feature : str
        Name of featureset in ``corpus``. E.g. ``'citations'``.
    k : int
        (default: 5) Number of burst states.
    topn : int or float {0.-1.}
        (default: 20) Number (int) or percentage (float) of top-occurring
        features to return. If ``flist`` is provided, this parameter is
        ignored.
    perslice : bool
        (default: False) If True, loads ``topn`` features per slice.
        Otherwise, loads ``topn`` features overall. If ``flist`` is provided,
        this parameter is ignored.
    flist : list
        List of features. If provided, ``topn`` and ``perslice`` are ignored.
    normalize : bool
        (default: True) If True, burstness is expressed as a fraction of the
        number of states. Otherwise, states themselves are returned (and
        block opacity is capped at 1).
    fig : :class:`matplotlib.figure.Figure`
        (default: None) You may provide a Figure instance if you wish.
        Otherwise, a new figure is generated.
    kwargs : kwargs
        ``color`` (default: ``'red'``) sets the colour of the burst blocks.
        All other keyword arguments are parameters for the burstness
        automaton HMM.

    Returns
    -------
    fig : :class:`matplotlib.figure.Figure`

    Examples
    --------

    .. code-block:: python

       >>> from tethne.analyze.corpus import plot_burstness
       >>> fig = plot_burstness(corpus, 'citations', topn=2, perslice=True)
       >>> fig.savefig('~/burstness.png')

    Years prior to the first occurrence of each feature are grayed out.
    Periods in which the feature was bursty are depicted by colored blocks,
    the opacity of which indicates burstness intensity.

    .. figure:: _static/images/burstness.png
       :width: 600
       :align: center
    """
    # 'color' is a plotting option; take it out of kwargs so that it is not
    # forwarded to the HMM, which does not accept it.
    color = kwargs.pop('color', 'red')

    B = burstness(corpus, feature, k=k, topn=topn, perslice=perslice,
                  flist=flist, normalize=normalize, **kwargs)

    # Get width based on slices (fall back to 1 when there is one slice).
    years = sorted(corpus.axes['date'].keys())
    width = years[1] - years[0] if len(years) > 1 else 1
    height = 1.0

    if fig is None:
        fig = plt.figure(figsize=(10, len(B) / 4.))

    n_plots = len(B)
    for f, (key, (x, y)) in enumerate(B.items(), start=1):
        ax = fig.add_subplot(n_plots, 1, f)
        ax.set_yticks([])
        ax.set_xbound(min(years), max(years) + 1)

        if f != n_plots:    # Only show xticks on the bottom subplot.
            ax.set_xticklabels([])

        # Block out years until first occurrence of feature. The first date
        # in each profile is the padding year that precedes the corpus, so
        # the first real occurrence is the second entry.
        ordered = sorted(x)
        first = ordered[1] if len(ordered) > 1 else max(years) + 1
        rect = mpatches.Rectangle((min(years), 0), first - min(years),
                                  height, fill=True, linewidth=0.0)
        rect.set_facecolor('black')
        rect.set_alpha(0.3)
        ax.add_patch(rect)

        # Add a rectangle for each year, shaded according to burstness
        # state. Every date in the profile is drawn, including the last.
        for d, state in zip(x, y):
            rect = mpatches.Rectangle((d, 0.), width, height,
                                      fill=True, linewidth=0.0)
            rect.set_facecolor(color)
            # Alpha must lie in [0, 1]; un-normalized states can exceed 1.
            rect.set_alpha(min(1.0, max(0.0, float(state))))
            ax.add_patch(rect)

        ax.set_ylabel(key, rotation=0,
                      horizontalalignment='right',
                      verticalalignment='center')

    plt.subplots_adjust(left=0.5)
    fig.tight_layout(h_pad=0.25)
    return fig
def burstness(corpus, feature, k=5, topn=20, perslice=False,
              flist=None, normalize=True, **kwargs):
    """
    Estimate burstness profile for the ``topn`` features (or ``flist``) in
    ``feature``.

    Uses the popular burstness automaton model introduced by Kleinberg
    (2002).

    Parameters
    ----------
    corpus : :class:`.Corpus`
    feature : str
        Name of featureset in ``corpus``. E.g. ``'citations'``.
    k : int
        (default: 5) Number of burst states.
    topn : int or float {0.-1.}
        (default: 20) Number (int) or percentage (float) of top-occurring
        features to return. If ``flist`` is provided, this parameter is
        ignored.
    perslice : bool
        (default: False) If True, loads ``topn`` features per slice.
        Otherwise, loads ``topn`` features overall. If ``flist`` is provided,
        this parameter is ignored.
    flist : list
        List of features. If provided, ``topn`` and ``perslice`` are ignored.
        Features that are not present in the featureset index are skipped.
    normalize : bool
        (default: True) Passed through to :func:`feature_burstness`\\; if
        False, states themselves are returned.
    kwargs : kwargs
        Parameters for burstness automaton HMM.

    Returns
    -------
    B : dict
        Keys are features, values are tuples of ( dates, burstness )

    Examples
    --------

    .. code-block:: python

       >>> from tethne.analyze.corpus import burstness
       >>> B = burstness(corpus, 'abstractTerms', flist=['process', 'method'])
       >>> B['process']
       ([1990, 1991, 1992, 1993], [0., 0.4, 0.6, 0.])

    """
    index = corpus.features[feature]['index']   # feature index -> feature

    if flist is None:
        top = _top_features(corpus, feature, topn=topn, perslice=perslice)
    else:
        # Invert the index to translate the requested features into indices,
        # ignoring features that don't exist.
        lookup = {value: findex for findex, value in index.items()}
        top = [lookup[f] for f in flist if f in lookup]

    B = {}
    for f in top:   # top is a list of feature indices.
        B[index[f]] = feature_burstness(corpus, feature, f, k=k,
                                        normalize=normalize, **kwargs)
    return B
def feature_burstness(corpus, feature, findex, k=5, normalize=True, **kwargs):
    """
    Estimate burstness profile for a feature over the ``'date'`` axis.

    Parameters
    ----------
    corpus : :class:`.Corpus`
    feature : str
        Name of featureset in ``corpus``. E.g. ``'citations'``.
    findex : int
        Index of ``feature`` in ``corpus``.
    k : int
        (default: 5) Number of burst states.
    normalize : bool
        (default: True) If True, the mean state for each date is divided by
        ``k``. Otherwise, mean states themselves are returned.
        NOTE(review): earlier documentation described normalization relative
        to the highest state (``k-1``); the code divides by ``k`` -- confirm
        which is intended.
    kwargs : kwargs
        Parameters for burstness automaton HMM.

    Returns
    -------
    dates : list
        Sorted dates. The first entry is a padding date one less than the
        earliest key of ``corpus.axes['date']``\\; the rest are the slices in
        which the feature occurs.
    burstness : list
        Mean burst state for each entry in ``dates``.

    Raises
    ------
    KeyError
        If ``feature`` is not a featureset in ``corpus``.
    """
    featureset = corpus.features[feature]['features']

    # Get time-intervals between occurrences.
    last = min(corpus.axes['date'].keys()) - 1
    dates = [last]  # Pad start.
    X_ = [1.]
    # The gap arithmetic below relies on visiting slices chronologically.
    for y, s_ in sorted(corpus.get_slices('date').items()):
        N = 0   # Number of papers in this slice that contain the feature.
        for p in s_:
            try:    # Not all papers have features.
                pfeatures = featureset[p]
            except KeyError:
                continue
            if any(f_[0] == findex for f_ in pfeatures):
                N += 1

        if N == 0:
            continue

        if y == last + 1:
            # Consecutive slices: spread the N events evenly over the slice.
            X_.extend([1. / N] * N)
            dates.extend([y] * N)
        else:
            X_.append(float(y - last))
            dates.append(y)
        last = int(y)

    # Get optimum state sequence, using the requested number of states.
    st = _forward(numpy.array(X_) * 100, k=k, **kwargs)

    # Bin by date.
    A = {}
    for d, state in zip(dates, st):
        A.setdefault(d, []).append(state)

    # Get mean burstness for each date.
    D = sorted(A.keys())
    values = [numpy.mean(A[d]) for d in D]

    # Normalize.
    if normalize:
        values = [float(v) / k for v in values]

    return D, values
def plot_sigma(G, corpus, feature, topn=20, sort_by='max', perslice=False,
               flist=None, fig=None, **kwargs):
    """
    Plot sigma values for the ``topn`` most influential nodes.

    Parameters
    ----------
    G : :class:`.GraphCollection`
    corpus : :class:`.Corpus`
    feature : str
        Name of a featureset in `corpus`.
    topn : int or float {0.-1.}
        (default: 20) Number (int) or percentage (float) of top-occurring
        features to return. If ``flist`` is provided, this parameter is
        ignored.
    sort_by : str
        (default: 'max') Criterion for selecting ``topn`` nodes. Only
        ``'max'`` is implemented; any other value selects no nodes.
    perslice : bool
        (default: False) If True, loads ``topn`` features per slice.
        Otherwise, loads ``topn`` features overall. If ``flist`` is provided,
        this parameter is ignored.
    flist : list
        List of nodes. If provided, ``topn`` and ``perslice`` are ignored.
    fig : :class:`matplotlib.figure.Figure`
        (default: None) You may provide a Figure instance if you wish.
        Otherwise, a new figure is generated.
    kwargs : kwargs
        ``color`` (default: ``'red'``) sets the colour of the sigma bands.

    Returns
    -------
    fig : :class:`matplotlib.figure.Figure`
    G : :class:`.GraphCollection`
        A co-citation graph collection, updated with ``sigma`` node
        attributes.

    Examples
    --------

    Assuming that you have a :class:`.Corpus` (``corpus``) sliced by
    ``'date'`` and a co-citation :class:`.GraphCollection` (``G``)...

    .. code-block:: python

       >>> from tethne.analyze.corpus import plot_sigma
       >>> fig,G = plot_sigma(G, corpus, 'citations', topn=5, perslice=True)
       >>> fig.savefig('~/sigma_plot.png')

    In this figure, the top 5 most sigma-influential nodes in each slice are
    shown. Red bands indicate periods in which each paper was influential;
    opacity indicates the intensity of sigma (normalized by the highest value
    in the plot). The period prior to the first instance of each node is
    grayed out.

    .. figure:: _static/images/sigma_plot.png
       :width: 600
       :align: center
    """
    G = sigma(G, corpus, feature)
    color = kwargs.get('color', 'red')

    # Get width based on slices (fall back to 1 when there is one slice).
    years = sorted(corpus.axes['date'].keys())
    width = years[1] - years[0] if len(years) > 1 else 1
    height = 1.0

    # Get node histories for sigma.
    nodes = G.nodes() if flist is None else flist
    histories = {node: G.node_history(node, 'sigma') for node in nodes}

    def peak(node):
        # Highest sigma a node ever reaches; 0 for an empty history.
        history = histories[node]
        return max(history.values()) if history else 0.

    if flist is not None:
        these_nodes = flist     # Use provided list of nodes.
    else:
        # Get only the topn most significant papers.
        include = set()
        if sort_by == 'max':
            if perslice:    # Get topn per slice.
                # Organize values in a way that makes selection easier.
                vals = {}
                for node in nodes:
                    if peak(node) == 0.:
                        continue
                    for year, val in histories[node].items():
                        vals.setdefault(year, {})[node] = val

                # Get the maximum values for each slice.
                for year_vals in vals.values():
                    keys = list(year_vals.keys())
                    vals_ = numpy.array([year_vals[n] for n in keys])
                    include.update(keys[i]
                                   for i in vals_.argsort()[-topn:][::-1])
            else:           # Get topn overall.
                keys = list(histories.keys())
                maxes = numpy.array([peak(n) for n in keys])
                include.update(keys[i] for i in maxes.argsort()[-topn:][::-1])

        # Nodes to include.
        these_nodes = [node for node in nodes
                       if node in include and peak(node) > 0]

    # Normalize by the highest sigma among all candidate nodes. Computed here
    # (rather than inside the selection branches) so that it is also defined
    # when ``flist`` is provided; guard against dividing by zero.
    norm_by = max([peak(node) for node in histories] or [0.])
    if not norm_by > 0:
        norm_by = 1.

    if fig is None:     # Create a new Figure instance.
        fig = plt.figure(figsize=(10, len(these_nodes) / 4.))

    # Plot!
    starts = [min(h.keys()) for h in histories.values() if h]
    x_min = min(starts) if starts else min(years)
    n_plots = len(these_nodes)
    for f, node in enumerate(these_nodes, start=1):
        x = sorted(histories[node].keys())
        y = numpy.array([histories[node][i] for i in x]) / norm_by

        ax = fig.add_subplot(n_plots, 1, f)
        ax.set_yticks([])
        ax.set_xbound(x_min, max(years) + 1)

        # Only show xticks on the bottom subplot.
        if f != n_plots:
            ax.set_xticklabels([])

        # Block out years until first occurrence of feature.
        first = x[0] if x else max(years) + 1
        rect = mpatches.Rectangle((min(years), 0), first - min(years),
                                  height, fill=True, linewidth=0.0)
        rect.set_facecolor('black')
        rect.set_alpha(0.1)
        ax.add_patch(rect)

        # Add a rectangle for each year, shaded according to sigma. Every
        # year in the history is drawn, including the last.
        for d, state in zip(x, y):
            rect = mpatches.Rectangle((d, 0.), width, height,
                                      fill=True, linewidth=0.0)
            rect.set_facecolor(color)
            # The +0.1 keeps faint values visible; cap at 1 because alpha
            # must lie in [0, 1].
            rect.set_alpha(min(1.0, float(state) + 0.1))
            ax.add_patch(rect)

        ax.set_ylabel(G.node_index[node], rotation=0,
                      horizontalalignment='right',
                      verticalalignment='center')

    plt.subplots_adjust(left=0.5)
    fig.tight_layout(h_pad=0.25)
    return fig, G
def sigma(G, corpus, feature, **kwargs):
    """
    Calculate sigma (from Chen 2009) for all of the nodes in a
    :class:`.GraphCollection`.

    For each graph in ``G``, sigma is computed per node as
    ``(betweenness_centrality + 1) ** burstness - 1``, where burstness is the
    value for that node in that graph's slice. Nodes with no burstness value
    for a slice are given a burstness of 0 (and therefore a sigma of 0).

    You can set parameters for burstness estimation using ``kwargs``:

    =========   ===============================================================
    Parameter   Description
    =========   ===============================================================
    s           Scaling parameter ( > 1.) that controls graininess of burst
                detection. Lower values make the model more sensitive.
                Defaults to 1.1.
    gamma       Parameter that controls the 'cost' of higher burst states.
                Defaults to 1.0.
    k           Number of burst states. Defaults to 5.
    =========   ===============================================================

    Parameters
    ----------
    G : :class:`.GraphCollection`
    corpus : :class:`.Corpus`
    feature : str
        Name of a featureset in `corpus`.

    Returns
    -------
    G : :class:`.GraphCollection`
        A graph collection updated with ``sigma`` and ``centrality`` node
        attributes.

    Examples
    --------

    Assuming that you have a :class:`.Corpus` generated from WoS data that
    has been sliced by ``date``.

    .. code-block:: python

       >>> # Generate a co-citation graph collection.
       >>> from tethne import GraphCollection
       >>> kwargs = { 'threshold':2, 'topn':100 }
       >>> G = GraphCollection()
       >>> # (call reconstructed from a garbled example; confirm against
       >>> # the GraphCollection API)
       >>> G.build(corpus, 'date', 'papers', 'cocitation', method_kwargs=kwargs)

       >>> # Calculate sigma. This may take several minutes, depending on the
       >>> # size of your co-citation graph collection.
       >>> from tethne.analyze.corpus import sigma
       >>> G = sigma(G, corpus, 'citations')

       >>> # Visualize...
       >>> from tethne.writers import collection
       >>> collection.to_dxgmml(G, '~/cocitation.xgmml')

    In the visualization below, node and label sizes are mapped to ``sigma``,
    and border width is mapped to ``citations``.

    .. figure:: _static/images/cocitation_sigma2.png
       :width: 600
       :align: center
    """
    nodes = list(G.node_lookup.keys())
    B = burstness(corpus, feature, flist=nodes, **kwargs)

    # Re-key each profile as { date: burstness } for direct lookup.
    B_ = {key: dict(zip(dates, bursts))
          for key, (dates, bursts) in B.items()}

    for key in sorted(G.graphs.keys()):
        graph = G[key]
        centrality = networkx.betweenness_centrality(graph)
        node_sigma = {}
        for n in graph.nodes():
            # burstness() drops features it cannot find, and profiles only
            # cover dates on which the feature occurs; treat anything
            # missing as "no burst" rather than failing with a KeyError.
            burst = B_.get(G.node_index[n], {}).get(key, 0.)
            node_sigma[n] = ((centrality[n] + 1) ** burst) - 1.

        # Update graph.
        # NOTE(review): (graph, name, values) is the networkx 1.x argument
        # order; networkx >= 2.0 expects (graph, values, name).
        networkx.set_node_attributes(graph, 'sigma', node_sigma)
        networkx.set_node_attributes(graph, 'centrality', centrality)

    return G