"""
Methods for analyzing :class:`.Corpus` objects.
.. autosummary::
:nosignatures:
burstness
feature_burstness
plot_burstness
plot_sigma
sigma
"""
import networkx
import numpy
from ..networks.helpers import top_cited
from ..classes import GraphCollection
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
def _forward(X, s=1.1, gamma=1., k=5):
"""
Forward dynamic algorithm for burstness automaton HMM, from `Kleinberg
(2002) <http://www.cs.cornell.edu/home/kleinber/bhs.pdf>`_.
Parameters
----------
X : list
A series of time-gaps between events.
s : float
(default: 1.1) Scaling parameter ( > 1.)that controls graininess of
burst detection. Lower values make the model more sensitive.
gamma : float
(default: 1.0) Parameter that controls the 'cost' of higher burst
states. Higher values make it more 'difficult' to achieve a higher
burst state.
k : int
(default: 5) Number of states. Higher values increase computational
cost of the algorithm. A maximum of 25 is suggested by the literature.
Returns
-------
states : list
Optimal state sequence.
"""
def alpha(i):
return (n/T)*(s**i)
def tau(i,j):
if j > i:
return (j-i)*gamma*numpy.log(n)
return 0.
def f(j,x):
return alpha(j) * numpy.exp(-1. * alpha(j) * x)
def C(j,t):
if j == 0 and t == 0:
return 0.
elif t == 0:
return numpy.inf
return ( -1.* numpy.log(f(j,X[t])) ) + \
numpy.min( [ C_values[l,t-1] + tau(l,j) for l in xrange(k) ] )
T = float(numpy.sum(X))
n = len(X)
C_values = numpy.zeros((k,n))
for j in xrange(k):
for t in xrange(len(X)):
C_values[j,t] = C(j,t)
states = []
for t in xrange(n):
state = numpy.argmin(C_values[:,t])
states.append(state)
return states
def _top_features(corpus, feature, topn=20, perslice=False, axis='date'):
if perslice:
top = []
for key, papers in corpus.get_slices(axis).iteritems():
scounts = corpus.feature_counts(feature, key, axis,
documentCounts=True)
scvalues = numpy.array(scounts.values())
top += [scounts.keys()[c] for c in scvalues.argsort()[-topn:][::-1]]
else:
counts = corpus.features[feature]['counts']
cvalues = numpy.array(counts.values())
top = [ counts.keys()[c] for c in cvalues.argsort()[-topn:][::-1] ]
return top
[docs]def plot_burstness(corpus, feature, k=5, topn=20, perslice=False,
flist=None, normalize=True, fig=None, **kwargs):
"""
Generate a figure depicting burstness profiles for ``feature``.
Parameters
----------
corpus : :class:`.Corpus`
feature : str
Name of featureset in ``corpus``. E.g. ``'citations'``.
k : int
(default: 5) Number of burst states.
topn : int or float {0.-1.}
(default: 20) Number (int) or percentage (float) of top-occurring
features to return. If ``flist`` is provided, this parameter is ignored.
perslice : bool
(default: False) If True, loads ``topn`` features per slice. Otherwise,
loads ``topn`` features overall. If ``flist`` is provided, this
parameter is ignored.
flist : list
List of features. If provided, ``topn`` and ``perslice`` are ignored.
normalize : bool
(default: True) If True, burstness is expressed relative to the hightest
possible state (``k-1``). Otherwise, states themselves are returned.
fig : :class:`matplotlib.figure.Figure`
(default: None) You may provide a Figure instance if you wish.
Otherwise, a new figure is generated.
kwargs : kwargs
Parameters for burstness automaton HMM.
Returns
-------
fig : :class:`matplotlib.figure.Figure`
Examples
--------
.. code-block:: python
>>> from tethne.analyze.corpus import burstness
>>> fig = plot_burstness(corpus, 'citations', topn=2, perslice=True)
>>> fig.savefig('~/burstness.png')
Years prior to the first occurrence of each feature are grayed out. Periods
in which the feature was bursty are depicted by colored blocks, the opacity
of which indicates burstness intensity.
.. figure:: _static/images/burstness.png
:width: 600
:align: center
"""
B = burstness(corpus, feature, k=k, topn=topn, perslice=perslice,
flist=flist, normalize=normalize, **kwargs)
color = kwargs.get('color', 'red')
# Get width based on slices.
years = sorted(corpus.axes['date'].keys())
width = years[1] - years[0]
height = 1.0
if fig is None:
fig = plt.figure(figsize=(10,len(B)/4.))
f = 1
axes = {}
for key, value in B.iteritems():
x,y = value
ax = fig.add_subplot(len(B),1,f)
f+=1
ax.set_yticks([])
ax.set_xbound(min(years), max(years) + 1)
if not f == len(B)+1: # Only show xticks on the bottom subplot.
ax.set_xticklabels([])
# Block out years until first occurrence of feature.
rect = mpatches.Rectangle( (min(years),0), sorted(x)[1]-min(years),
height, fill=True, linewidth=0.0 )
rect.set_facecolor('black')
rect.set_alpha(0.3)
ax.add_patch(rect)
# Add a rectangle for each year, shaded according to burstness state.
for d in xrange(min(x), max(x)):
try:
i = x.index(d)
except ValueError:
continue
xy = (d, 0.)
state = y[i]
rect = mpatches.Rectangle( xy, width, height, fill=True,
linewidth=0.0 )
rect.set_facecolor(color)
rect.set_alpha(state)
ax.add_patch(rect)
ax.set_ylabel( key, rotation=0,
horizontalalignment='right',
verticalalignment='center' )
plt.subplots_adjust(left=0.5)
fig.tight_layout(h_pad=0.25)
return fig
[docs]def burstness(corpus, feature, k=5, topn=20, perslice=False,
flist=None, normalize=True, **kwargs):
"""
Estimate burstness profile for the ``topn`` features (or ``flist``) in
``feature``.
Uses the popular burstness automaton model inroduced by `Kleinberg (2002)
<http://www.cs.cornell.edu/home/kleinber/bhs.pdf>`_.
Parameters
----------
corpus : :class:`.Corpus`
feature : str
Name of featureset in ``corpus``. E.g. ``'citations'``.
k : int
(default: 5) Number of burst states.
topn : int or float {0.-1.}
(default: 20) Number (int) or percentage (float) of top-occurring
features to return. If ``flist`` is provided, this parameter is ignored.
perslice : bool
(default: False) If True, loads ``topn`` features per slice. Otherwise,
loads ``topn`` features overall. If ``flist`` is provided, this
parameter is ignored.
flist : list
List of features. If provided, ``topn`` and ``perslice`` are ignored.
normalize : bool
(default: True) If True, burstness is expressed relative to the hightest
possible state (``k-1``). Otherwise, states themselves are returned.
kwargs : kwargs
Parameters for burstness automaton HMM.
Returns
-------
B : dict
Keys are features, values are tuples of ( dates, burstness )
Examples
--------
.. code-block:: python
>>> from tethne.analyze.corpus import burstness
>>> B = burstness(corpus, 'abstractTerms', flist=['process', 'method']
>>> B['process']
([1990, 1991, 1992, 1993], [0., 0.4, 0.6, 0.])
"""
if flist is None:
top = _top_features(corpus, feature, topn=topn, perslice=perslice)
else:
lookup = {v:k for k,v in corpus.features[feature]['index'].iteritems()}
top = []
for f in flist: # Get feature indices.
try: # Ignore features that don't exist.
top.append(lookup[f])
except KeyError:
pass
B = {}
for f in top: # top is a list of feature indices.
feat = corpus.features[feature]['index'][f]
B[feat] = feature_burstness(corpus, feature, f, k=k,
normalize=normalize, **kwargs)
return B
[docs]def feature_burstness(corpus, feature, findex, k=5, normalize=True, **kwargs):
"""
Estimate burstness profile for a feature over the ``'date'`` axis.
Parameters
----------
corpus : :class:`.Corpus`
feature : str
Name of featureset in ``corpus``. E.g. ``'citations'``.
findex : int
Index of ``feature`` in ``corpus``.
k : int
(default: 5) Number of burst states.
normalize : bool
(default: True) If True, burstness is expressed relative to the hightest
possible state (``k-1``). Otherwise, states themselves are returned.
kwargs : kwargs
Parameters for burstness automaton HMM.
"""
# Get time-intervals between occurrences.
last = min(corpus.axes['date'].keys())-1
dates = [last] # Pad start.
X_ = [1.]
for y,s_ in corpus.get_slices('date').iteritems():
this = []
for p in s_:
try: # Not all papers have features.
f_ = zip(*corpus.features[feature]['features'][p])[0]
if findex in f_:
this.append(p)
except KeyError:
continue
N = len(this)
if N == 0:
continue
if y == last + 1:
for n_ in xrange(N):
X_.append(1./float(N))
dates.append(y)
else:
X_.append(float(y - last))
dates.append(y)
last = int(y)
# Get optimum state sequence.
st = _forward(numpy.array(X_)*100, **kwargs)
# Bin by date.
A = {}
for i in xrange(len(X_)):
d = dates[i]
if i not in A:
A[d] = []
A[d].append(st[i])
# Get mean burstness for each year.
for key, values in A.iteritems():
A[key] = numpy.mean(values)
# Normalize.
if normalize:
A_ = { key:float(v)/k for key,v in A.iteritems() }
else: A_ = A
D = sorted(A.keys())
return D, [ A_[d] for d in D ]
[docs]def plot_sigma(G, corpus, feature, topn=20, sort_by='max', perslice=False,
flist=None, fig=None, **kwargs):
"""
Plot sigma values for the ``topn`` most influential nodes.
Parameters
----------
G : :class:`.GraphCollection`
corpus : :class:`.Corpus`
feature : str
Name of a featureset in `corpus`.
topn : int or float {0.-1.}
(default: 20) Number (int) or percentage (float) of top-occurring
features to return. If ``flist`` is provided, this parameter is ignored.
sort_by : str
(default: 'max') Criterion for selecting ``topn`` nodes.
perslice : bool
(default: False) If True, loads ``topn`` features per slice. Otherwise,
loads ``topn`` features overall. If ``flist`` is provided, this
parameter is ignored.
flist : list
List of nodes. If provided, ``topn`` and ``perslice`` are ignored.
fig : :class:`matplotlib.figure.Figure`
(default: None) You may provide a Figure instance if you wish.
Otherwise, a new figure is generated.
Returns
-------
fig : :class:`matplotlib.figure.Figure`
G : :class:`.GraphCollection`
A co-citation graph collection, updated with ``sigma`` node attributes.
Examples
--------
Assuming that you have a :class:`.Corpus` (``G``) sliced by ``'date'`` and a
co-citation :class:`.GraphCollection` (``corpus``)...
.. code-block:: python
>>> from tethne.analyze.cocitation import plot_sigma
>>> fig,G = plot_sigma(G, corpus, topn=5, perslice=True)
>>> fig.savefig('~/sigma_plot.png')
In this figure, the top 5 most sigma-influential nodes in each slice are
shown. Red bands indicate periods in which each paper was influential;
opacity indicates the intensity of sigma (normalized by the highest value in
the plot). The period prior to the first instance of each node is grayed
out.
.. figure:: _static/images/sigma_plot.png
:width: 600
:align: center
"""
G = sigma(G, corpus, feature)
nodes = G.nodes()
color = kwargs.get('color', 'red')
years = sorted(corpus.axes['date'].keys())
width = years[1] - years[0] # Get width based on slices.
height = 1.0
# Get node histories for sigma.
histories = {}
if flist is not None:
nodes = flist
for node in nodes:
histories[node] = G.node_history(node, 'sigma')
if flist is not None:
these_nodes = flist # Use provided list of nodes.
else:
# Get only the topn most significant papers.
include = []
if sort_by == 'max':
if perslice: # Get topn per slice.
vals = {}
norm_by = 0.
# Organize values in a way that makes selection easier.
for node in nodes:
if max(histories[node].values()) == 0.:
continue
for year,val in histories[node].iteritems():
try:
vals[year][node] = val
except KeyError:
vals[year] = { node:val }
# Get the maximum values for each slice.
for year in vals.keys():
vals_ = numpy.array(vals[year].values())
indices = vals_.argsort()[-topn:][::-1]
include += [ vals[year].keys()[i] for i in indices ]
if numpy.max(vals_) > norm_by:
norm_by = numpy.max(vals_)
else: # Get topn overall.
maxes = numpy.array([ max(v.values()) for v
in histories.values() ])
indices = maxes.argsort()[-topn:][::-1]
include = [ histories.keys()[i] for i in indices ]
norm_by = numpy.max(maxes)
# Nodes to include.
these_nodes = [ node for node in nodes
if max(histories[node].values()) > 0
and node in include ]
if fig is None: # Create a new Figure instance.
fig = plt.figure(figsize=(10,len(these_nodes)/4.))
# Plot!
f = 1 # Current subplot.
axes = {}
x_min = min([min(v.keys()) for v in histories.values()])
for node in these_nodes:
x = sorted(histories[node].keys())
y = numpy.array([ histories[node][i] for i in x ])/norm_by
ax = fig.add_subplot(len(these_nodes),1,f)
f+=1
ax.set_yticks([])
ax.set_xbound(x_min, max(years)+1)
# Only show xticks on the bottom subplot.
if not f == len(these_nodes) + 1:
ax.set_xticklabels([])
# Block out years until first occurrence of feature.
rect = mpatches.Rectangle( (min(years),0), sorted(x)[0]-min(years),
height, fill=True, linewidth=0.0 )
rect.set_facecolor('black')
rect.set_alpha(0.1)
ax.add_patch(rect)
# Add a rectangle for each year, shaded according to burstness state.
for d in xrange(min(x), max(x)):
try: # May not have values for all years.
i = x.index(d)
except ValueError:
continue
xy = (d, 0.)
state = y[i]
rect = mpatches.Rectangle( xy, width, height, fill=True,
linewidth=0.0 )
rect.set_facecolor(color)
rect.set_alpha(state + 0.1)
ax.add_patch(rect)
ax.set_ylabel( G.node_index[node], rotation=0,
horizontalalignment='right',
verticalalignment='center' )
plt.subplots_adjust(left=0.5)
fig.tight_layout(h_pad=0.25)
return fig, G
[docs]def sigma(G, corpus, feature, **kwargs):
"""
Calculate sigma (from `Chen 2009 <http://arxiv.org/pdf/0904.1439.pdf>`_) for
all of the nodes in a :class:`.GraphCollection`\.
You can set parameters for burstness estimation using ``kwargs``:
========= ================================================================
Parameter Description
========= ================================================================
s Scaling parameter ( > 1.)that controls graininess of burst
detection. Lower values make the model more sensitive. Defaults
to 1.1.
gamma Parameter that controls the 'cost' of higher burst states.
Defaults to 1.0.
k Number of burst states. Defaults to 5.
========= ================================================================
Parameters
----------
G : :class:`.GraphCollection`
corpus : :class:`.Corpus`
feature : str
Name of a featureset in `corpus`.
Returns
-------
G : :class:`.GraphCollection`
A graph collection updated with ``sigma`` node attributes.
Examples
--------
Assuming that you have a :class:`.Corpus` generated from WoS data that has
been sliced by ``date``.
.. code-block:: python
>>> # Generate a co-citation graph collection.
>>> from tethne import GraphCollection
>>> kwargs = { 'threshold':2, 'topn':100 }
>>> G = GraphCollection()
>>> G.build(corpus, 'date', 'papers', 'cocitation', method_kwargs=kwargs)
>>> # Calculate sigma. This may take several minutes, depending on the
>>> # size of your co-citaiton graph collection.
>>> from tethne.analyze.corpus import sigma
>>> G = sigma(G, corpus, 'citations')
>>> # Visualize...
>>> from tethne.writers import collection
>>> collection.to_dxgmml(G, '~/cocitation.xgmml')
In the visualization below, node and label sizes are mapped to ``sigma``,
and border width is mapped to ``citations``.
.. figure:: _static/images/cocitation_sigma2.png
:width: 600
:align: center
"""
nodes = G.node_lookup.keys()
B = burstness(corpus, feature, flist=nodes, **kwargs)
B_ = {}
for key, values in B.iteritems():
dates,bursts = values
B_[key] = { dates[i]:bursts[i] for i in xrange(len(dates)) }
for key in sorted(G.graphs.keys()):
centrality = networkx.betweenness_centrality(G[key])
sigma = {}
for n in G[key].nodes():
n_ = G.node_index[n]
sigma[n] = ( ( centrality[n] + 1 ) ** B_[n_][key] ) - 1.
# Update graph.
networkx.set_node_attributes(G[key], 'sigma', sigma)
networkx.set_node_attributes(G[key], 'centrality', centrality)
return G