"""
Methods for analyzing :class:`.Corpus` objects.
.. autosummary::
:nosignatures:
burstness
feature_burstness
sigma
"""
import sys
import warnings
from math import exp, log
from collections import defaultdict

import networkx as nx

from tethne.utilities import argmin, mean

PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    # Alias removed Python 2 builtins so the rest of the module runs
    # unchanged on Python 3.
    unicode = str
    xrange = range
    izip = zip    # itertools.izip was removed in Python 3; zip is lazy there.
else:
    from itertools import izip
def _forward(X, s=1.1, gamma=1., k=5):
"""
Forward dynamic algorithm for burstness automaton HMM, from `Kleinberg
(2002) <http://www.cs.cornell.edu/home/kleinber/bhs.pdf>`_.
Parameters
----------
X : list
A series of time-gaps between events.
s : float
(default: 1.1) Scaling parameter ( > 1.)that controls graininess of
burst detection. Lower values make the model more sensitive.
gamma : float
(default: 1.0) Parameter that controls the 'cost' of higher burst
states. Higher values make it more 'difficult' to achieve a higher
burst state.
k : int
(default: 5) Number of states. Higher values increase computational
cost of the algorithm. A maximum of 25 is suggested by the literature.
Returns
-------
states : list
Optimal state sequence.
"""
X = list(X)
def alpha(i):
return (n/T)*(s**i)
def tau(i, j):
if j > i:
return (j-i)*gamma*log(n)
return 0.
def f(j, x):
return alpha(j) * exp(-1. * alpha(j) * x)
def C(j, t):
if j == 0 and t == 0:
return 0.
elif t == 0:
return float("inf")
C_tau = min([C_values[l][t-1] + tau(l, j) for l in xrange(k)])
return (-1. * log(f(j,X[t]))) + C_tau
T = sum(X)
n = len(X)
# C() requires default (0) values, so we construct the "array" in advance.
C_values = [[0 for t in xrange(len(X))] for j in xrange(k)]
for j in xrange(k):
for t in xrange(len(X)):
C_values[j][t] = C(j,t)
# Find the optimal state sequence.
states = [argmin([c[t] for c in C_values]) for t in xrange(n)]
return states
def _top_features(corpus, feature, topn=20, perslice=False, axis='date'):
warnings.warn("Removed in 0.8. Use corpus.top_features() instead.",
DeprecationWarning)
return corpus.top_features(feature, topn=topn, perslice=perslice)
def burstness(corpus, featureset_name, features=None, k=5, topn=20,
              perslice=False, normalize=True, **kwargs):
    """
    Estimate burstness profile for the ``topn`` features (or ``features``) in
    ``featureset_name``.

    Uses the popular burstness automaton model introduced by `Kleinberg (2002)
    <http://www.cs.cornell.edu/home/kleinber/bhs.pdf>`_.

    Parameters
    ----------
    corpus : :class:`.Corpus`
    featureset_name : str
        Name of featureset in ``corpus``. E.g. ``'citations'``.
    features : list
        List of features. If provided, ``topn`` and ``perslice`` are ignored.
    k : int
        (default: 5) Number of burst states.
    topn : int or float {0.-1.}
        (default: 20) Number (int) or percentage (float) of top-occurring
        features to return. Ignored if ``features`` is provided.
    perslice : bool
        (default: False) If True, loads ``topn`` features per slice. Otherwise,
        loads ``topn`` features overall. Ignored if ``features`` is provided.
    normalize : bool
        (default: True) If True, burstness is expressed relative to the highest
        possible state (``k-1``). Otherwise, states themselves are returned.
    kwargs : kwargs
        Parameters for burstness automaton HMM.

    Returns
    -------
    B : dict
        Keys are features, values are tuples of ( dates, burstness )

    Examples
    --------
    .. code-block:: python

       >>> from tethne.analyze.corpus import burstness
       >>> B = burstness(corpus, 'abstractTerms', features=['process', 'method'])
       >>> B['process']
       ([1990, 1991, 1992, 1993], [0., 0.4, 0.6, 0.])
    """
    # ``features=None`` avoids a mutable default argument; an explicit empty
    # list from the caller behaves exactly as before.
    if not features:
        T = corpus.top_features(featureset_name, topn=topn, perslice=perslice)
        if perslice:
            # list(zip(...)) is required on Python 3, where zip objects are
            # not subscriptable.
            features = list(set([f[0] for flist in list(zip(*T))[1]
                                 for f in flist]))
        else:
            features = list(zip(*T))[0]

    B = {feature: feature_burstness(corpus, featureset_name, feature, k=k,
                                    normalize=normalize, **kwargs)
         for feature in features}
    return B
def feature_burstness(corpus, featureset_name, feature, k=5, normalize=True,
                      s=1.1, gamma=1., **slice_kwargs):
    """
    Estimate burstness profile for a feature over the ``'date'`` axis.

    Parameters
    ----------
    corpus : :class:`.Corpus`
    featureset_name : str
        Name of featureset in ``corpus``. E.g. ``'citations'``.
    feature : str
        Name of the feature of interest within the featureset.
    k : int
        (default: 5) Number of burst states.
    normalize : bool
        (default: True) If True, burstness is expressed relative to the highest
        possible state (``k-1``). Otherwise, states themselves are returned.
    s : float
        (default: 1.1) Scaling parameter for the burstness automaton HMM.
    gamma : float
        (default: 1.0) State-transition cost parameter for the HMM.

    Returns
    -------
    tuple
        ( dates, burstness ): parallel lists of years and burstness values.
    """
    if featureset_name not in corpus.features:
        corpus.index_feature(featureset_name)

    if 'date' not in corpus.indices:
        corpus.index('date')

    # Build the series of time-gaps between occurrences. ``dates`` and ``X_``
    # are kept strictly parallel so states can be binned by year below.
    dates = [min(corpus.indices['date'].keys()) - 1]    # Pad start.
    X_ = [1.]

    years, values = corpus.feature_distribution(featureset_name, feature)
    # Plain zip works on both Python 2 and 3 (izip was removed in Python 3).
    for year, N in zip(years, values):
        if N == 0:
            continue
        if N > 1:
            if year == dates[-1] + 1:
                # Consecutive year: N events spread evenly within the year.
                for _ in range(int(N)):
                    X_.append(1. / N)
                    dates.append(year)
            else:
                # Gap before this year, then N-1 intra-year intervals.
                X_.append(float(year - dates[-1]))
                dates.append(year)
                for _ in range(int(N) - 1):
                    X_.append(1. / (N - 1))
                    dates.append(year)
        else:
            X_.append(float(year - dates[-1]))
            dates.append(year)

    # Scale gaps before handing them to the automaton, then get the optimal
    # state sequence. A list comprehension (not map) keeps this Python 3-safe.
    st = _forward([x * 100 for x in X_], s=s, gamma=gamma, k=k)

    # Bin burst states by date.
    A = defaultdict(list)
    for date, state in zip(dates, st):
        A[date].append(state)

    if normalize:
        # NOTE(review): the docstring describes normalization relative to the
        # highest state (k-1), but the original divides by k; preserved as-is.
        A = {year: mean(states) / k for year, states in A.items()}
    else:
        A = {year: mean(states) for year, states in A.items()}

    # Drop the padding year prepended above.
    D = sorted(A.keys())
    return D[1:], [A[d] for d in D[1:]]
def sigma(G, corpus, featureset_name, B=None, **kwargs):
    """
    Calculate sigma (from `Chen 2009 <http://arxiv.org/pdf/0904.1439.pdf>`_)
    for all of the nodes in a :class:`.GraphCollection`\.

    You can set parameters for burstness estimation using ``kwargs``:

    ========= ===============================================================
    Parameter Description
    ========= ===============================================================
    s         Scaling parameter ( > 1.) that controls graininess of burst
              detection. Lower values make the model more sensitive. Defaults
              to 1.1.
    gamma     Parameter that controls the 'cost' of higher burst states.
              Defaults to 1.0.
    k         Number of burst states. Defaults to 5.
    ========= ===============================================================

    Parameters
    ----------
    G : :class:`.GraphCollection`
    corpus : :class:`.Corpus`
    featureset_name : str
        Name of a featureset in `corpus`.
    B : dict
        (optional) Pre-computed burstness profiles, as returned by
        :func:`.burstness`\. Computed on demand if not provided.

    Returns
    -------
    dict
        Keys are node labels; values are tuples ([years...], [sigma...]).

    Examples
    --------
    Assuming that you have a :class:`.Corpus` generated from WoS data that has
    been sliced by ``date``.

    .. code-block:: python

       >>> # Generate a co-citation graph collection.
       >>> from tethne import GraphCollection
       >>> kwargs = { 'threshold':2, 'topn':100 }
       >>> G = GraphCollection()
       >>> G.build(corpus, 'date', 'papers', 'cocitation', method_kwargs=kwargs)

       >>> # Calculate sigma. This may take several minutes, depending on the
       >>> # size of your co-citation graph collection.
       >>> from tethne.analyze.corpus import sigma
       >>> G = sigma(G, corpus, 'citations')

       >>> # Visualize...
       >>> from tethne.writers import collection
       >>> collection.to_dxgmml(G, '~/cocitation.xgmml')

    In the visualization below, node and label sizes are mapped to ``sigma``,
    and border width is mapped to ``citations``.

    .. figure:: _static/images/cocitation_sigma2.png
       :width: 600
       :align: center
    """
    if 'date' not in corpus.indices:
        corpus.index('date')

    # Calculate burstness if not provided.
    if not B:
        B = burstness(corpus, featureset_name, features=G.nodes(), **kwargs)

    # ``.items()`` (not ``.iteritems()``) works on both Python 2 and 3.
    Sigma = {}      # Keys are dates (from GraphCollection); values are
                    # node:sigma dicts.
    for key, graph in G.items():
        centrality = nx.betweenness_centrality(graph)
        sigma = {}      # Sigma values for all features in this year.
        attrs = {}      # Sigma values for only those features in this graph.
        for n_, burst in B.items():
            burst = dict(list(zip(*burst)))     # Reorganize for easier lookup.

            # Nodes are indexed as integers in the GraphCollection.
            n = G.node_lookup[n_]

            # We have burstness values for years in which the feature ``n``
            # occurs, and we have centrality values for years in which ``n``
            # made it into the graph. ``n in graph`` is O(1) membership,
            # equivalent to ``n in graph.nodes()``.
            if n in graph and key in burst:
                sigma[n] = ((centrality[n] + 1.) ** burst[key]) - 1.
                attrs[n] = sigma[n]

        # Update graph with sigma values.
        # NOTE(review): (graph, name, values) is the NetworkX < 2.0 argument
        # order, consistent with the rest of this module; NetworkX >= 2.0
        # expects (graph, values, name) — confirm against the pinned version.
        nx.set_node_attributes(graph, 'sigma', attrs)
        Sigma[key] = sigma

    # Invert results and update the GraphCollection.master_graph.
    # TODO: is there a more efficient way to do this?
    inverse = defaultdict(dict)
    for gname, result in Sigma.items():
        if hasattr(result, '__iter__'):
            for n, val in result.items():
                inverse[n].update({gname: val})
    nx.set_node_attributes(G.master_graph, 'sigma', inverse)

    # We want to return results in the same format as burstness(); with node
    # labels as keys; values are tuples ([years...], [sigma...]).
    return {n: list(zip(*G.node_history(G.node_lookup[n], 'sigma').items()))
            for n in B.keys()}