Source code for tethne.model.managers.dtm

"""
Classes and methods related to the :class:`.DTMModelManager`\.
"""

import os
import re
import shutil
import tempfile
import subprocess
import numpy as np

from networkx import Graph

import logging
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel('ERROR')

from ...classes import GraphCollection
from ..social import TAPModel
from ..managers import ModelManager
from ..corpus.dtmmodel import from_gerrish

[docs]class DTMModelManager(ModelManager):
    """
    Generates a :class:`.DTMModel` from a :class:`.Corpus` using 
    `Gerrish's C++ implementation <http://code.google.com/p/princeton-statistical-learning/downloads/detail?name=dtm_release-0.8.tgz>`_.

    You should be sure to slice your :class:`.Corpus` by 'date' using the 
    'time_period' method (for details, see :meth:`.Corpus.slice`\).
    
    .. autosummary::
       :nosignatures:
       
       plot_topic_evolution
       topic_over_time
       
    
    Parameters
    ----------
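    D : :class:`.Corpus`
    feature : str
        (default: 'unigrams') Name of the featureset in ``D`` to model.
    outpath : str
        Path to output directory.
    temppath : str
        (default: None) Directory for the intermediate data files described
        below.
    dtm_path : str
        Path to the DTM executable (e.g. ``/path/to/dtm/bin/main``).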
        
    Examples
    --------
    
    Starting with some JSTOR DfR data (with wordcounts), a typical workflow
    might look something like this:
    
    .. code-block:: python
    
       >>> from nltk.corpus import stopwords                 #  1. Get stoplist.
       >>> stoplist = stopwords.words()
       
       >>> from tethne.readers import dfr                    #  2. Build Corpus.
       >>> C = dfr.corpus_from_dir('/path/to/DfR/datasets', 'uni', stoplist)
       
       >>> def filt(s, C, DC):                           # 3. Filter wordcounts.
       ...     if C > 3 and DC > 1 and len(s) > 3:
       ...         return True
       ...     return False
       >>> C.filter_features('wordcounts', 'wc_filtered', filt)
       
       >>> C.slice('date', 'time_period', window_size=5)     #  4. Slice Corpus.
       
       >>> from tethne.model import DTMModelManager          #   5. Get Manager.
       >>> outpath = '/path/to/my/working/directory'
       >>> dtm = '/path/to/dtm/bin/main'
       >>> M = DTMModelManager(C, 'wc_filtered', outpath, dtm_path=dtm)
       
       >>> M.prep()                                          #    6. Prep model.
       
       >>> model = M.build(Z=50)                             #   7. Build model.
       >>> model                                             # (may take awhile)
       <tethne.model.corpus.dtmmodel.DTMModel at 0x10bfac710>
       
    A plot showing the log-likelihood/topic over modeling iterations should be
    generated in your `outpath`. For example:
    
    .. figure:: _static/images/dtmmodel_LL.png
       :width: 400
       :align: center
    
    Behind the scenes, the :func:`.prep` procedure generates data files at
    ``temppath`` describing your :class:`.Corpus`\:
    
    * ``tethne-vocab.dat`` contains all of the words in the corpus, one per
      line.
    * ``tethne-mult.dat`` contains wordcounts for each document; words are
      represented by integer indices corresponding to line numbers in 
      ``tethne-vocab.dat``. Documents are ordered by publication date (earliest
      to latest).
    * ``tethne-seq.dat`` describes how documents are to be apportioned among
      time-periods. The first line is the number of time periods, and the 
      subsequent lines specify the number of documents in each successive
      time-period.
    * ``tethne-meta.dat`` is a tab-delimited metadata file. Its records occur
      in the same order as the documents in ``tethne-mult.dat``. For
      example::
      
       id	date	atitle
       10.2307/2437162	1945	SOME ECOTYPIC RELATIONS OF DESCHAMPSIA CAESPITOSA
       10.2307/4353229	1940	ENVIRONMENTAL INFLUENCE AND TRANSPLANT EXPERIMENTS
       10.2307/4353158	1937	SOME FUNDAMENTAL PROBLEMS OF TAXONOMY AND PHYLOGENETICS

    The :func:`.build` procedure then starts the DTM modeling algorithm. This 
    step may take a considerable amount of time, anywhere from a few minutes 
    (small corpus, few topics) to a few hours (large corpus, many topics).
    **Warning:** this implementation of DTM is known to run into memory issues
    with large vocabularies. If that happens, try applying a more restrictive
    filter to the featureset, using
    :func:`.Corpus.filter_features`\.
    
    Once the :class:`.DTMModel` is built, you can access its methods directly.
    See full method descriptions in :class:`.DTMModel`\. Of special interest 
    are:
    
    
    .. currentmodule:: tethne.model.corpus.dtmmodel
    
    .. autosummary::
       :nosignatures:
    
       DTMModel.list_topic_diachronic
       DTMModel.print_topic_diachronic
       DTMModel.topic_evolution
    
    To plot the evolution of a topic over time, use 
    :func:`.plot_topic_evolution`\. 
    
    .. code-block:: python
    
       >>> M.plot_topic_evolution(2, plot=True)
       
    ...should generate a plot at ``outpath`` called ``topic_2_evolution.png``:

    .. figure:: _static/images/topic_2_evolution.png
       :width: 400
       :align: center
    """
    
    def __init__(self, D, feature='unigrams', outpath='/tmp',
                          temppath=None, dtm_path='./bin/main'):
        """
        Store the corpus and DTM settings, and derive the working file paths.

        Parameters
        ----------
        D : :class:`.Corpus`
            Corpus to be modeled.
        feature : str
            (default: 'unigrams') Name of the featureset to use.
        outpath : str
            (default: '/tmp') Forwarded to the parent initializer.
        temppath : str
            (default: None) Forwarded to the parent initializer.
        dtm_path : str
            (default: './bin/main') Path of the DTM executable.
        """
        # NOTE(review): the parent initializer is expected to provide
        # ``self.outpath`` and ``self.temp``, both read below -- confirm
        # against ModelManager.
        super(DTMModelManager, self).__init__(outpath, temppath)

        self.D = D
        self.feature = feature
        self.dtm_path = dtm_path

        # Prefix handed to the DTM executable for its output files.
        self.outname = '{0}/model_run'.format(self.outpath)

        # Intermediate data files, all located in the temp directory.
        data_files = (
            ('mult_path', '{0}/tethne-mult.dat'),
            ('seq_path', '{0}/tethne-seq.dat'),
            ('vocab_path', '{0}/tethne-vocab.dat'),
            ('meta_path', '{0}/tethne-meta.dat'),
        )
        for attribute, template in data_files:
            setattr(self, attribute, template.format(self.temp))
    
    def _generate_corpus(self, meta):
        """
        Hand the corpus to the DTM input writer.

        Parameters
        ----------
        meta : list
            Passed to the writer as its ``fields`` argument (presumably the
            metadata fields to include -- verify against ``to_dtm_input``).
        """
        # Imported lazily, as in the rest of this method's history.
        from tethne.writers.corpora import to_dtm_input

        # All generated files share the ``<temp>/tethne`` prefix.
        target_prefix = self.temp + '/tethne'
        to_dtm_input(target_prefix, self.D, self.feature, fields=meta)
    
    def _run_model(self, **kwargs):
        ## Run the dynamic topic model.
        #./main \
        #  --ntopics=20 \
        #  --mode=fit \
        #  --rng_seed=0 \
        #  --initialize_lda=true \
        #  --corpus_prefix=example/test \
        #  --outname=example/model_run \
        #  --top_chain_var=0.005 \
        #  --alpha=0.01 \
        #  --lda_sequence_min_iter=6 \
        #  --lda_sequence_max_iter=20 \
        #  --lda_max_em_iter=10
        
        top_chain_var = kwargs.get('top_chain_var', 0.005)
        lda_seq_min_iter = kwargs.get('lda_seq_min_iter', 6)
        lda_seq_max_iter = kwargs.get('lda_seq_max_iter', 20)
        lda_max_em_iter = kwargs.get('lda_max_em_iter', 10)  
        alpha = kwargs.get('alpha', 0.01) 
        
        max_v = lda_seq_min_iter*lda_max_em_iter*len(self.D.get_slices('date'))
        
        self.conv = []
        i = 1

        corpus_prefix = '{0}/tethne'.format(self.temp)
        
        FNULL = open(os.devnull, 'w')
        
        p = subprocess.Popen( [ self.dtm_path,
                    '--ntopics={0}'.format(self.Z),
                    '--mode=fit',
                    '--rng_seed=0',
                    '--initialize_lda=true',
                    '--corpus_prefix={0}'.format(corpus_prefix),
                    '--outname={0}'.format(self.outname),
                    '--top_chain_var={0}'.format(top_chain_var),
                    '--alpha={0}'.format(alpha),
                    '--lda_sequence_min_iter={0}'.format(lda_seq_min_iter),
                    '--lda_sequence_max_iter={0}'.format(lda_seq_max_iter),
                    '--lda_max_em_iter={0}'.format(lda_max_em_iter) ],
                stderr=subprocess.PIPE,
                stdout=FNULL)
        
        while p.poll() is None:
            l = p.stderr.readline()
            try:    # Find the LL
                this_ll = float(re.findall(r'^lhood\s+=\s+([-]?\d+\.\d+)', l)[0])
                self.ll.append(this_ll)
                
                self.ll_iters.append(i)
                i += 1
            except IndexError:
                pass
            
            try:    # Find conv
                conv = re.findall(r'conv\s+=\s+([-]?\d+\.\d+e[-]\d+)', l)
                self.conv.append(float(conv[0]))

                progress = int(100 * float(len(self.conv))/float(max_v))

            except IndexError:
                pass
    
        self.num_iters += lda_max_em_iter   # TODO: does this make sense?
            
    def _load_model(self):
        """
        Build a model object from the DTM output and keep it on the manager.

        Returns
        -------
        model
            The object returned by ``from_gerrish``; also stored as
            ``self.model``, with its ``vocabulary`` mirrored on
            ``self.vocabulary``.
        """

        loaded = from_gerrish(self.outname, self.meta_path, self.vocab_path)
        self.model = loaded
        self.vocabulary = loaded.vocabulary
        return loaded

[docs]    def plot_topic_evolution(self, k, Nwords=5, plot=False,
                                      figargs={'figsize':(10,10)} ):
        """
        Plot the probability of the top ``Nwords`` words in topic ``k`` over
        time.
        
        If ``plot`` is True, generates a plot image at ``outpath``.
        
        TODO: should return a Figure object.
           
        Parameters
        ----------
        k : int
            Topic index.
        Nwords : int
            Number of words to include in plot.
        plot : bool
            (default: False) If True, generates a plot image at ``outpath``.
        figargs : dict
            Keyword arguments to pass to :func:`matplotlib.pyplot.plot`\.
        
        Returns
        -------
        keys : list
            Start-date of each time-period.
        t_series : list
            Array of p(w|t) for Nwords for each time-period.
            
        Examples
        --------

        .. code-block:: python
        
           >>> M.plot_topic_evolution(2, plot=True)
           
        ...should generate a plot at ``outpath`` called 
        ``topic_2_evolution.png``:

        .. figure:: _static/images/topic_2_evolution.png
           :width: 400
           :align: center
        """
        
        t_keys, t_series = self.model.topic_evolution(k, Nwords)
        
        slices = self.D.get_slices('date')
        keys = sorted(slices.keys())
        
        if plot:
            import matplotlib.pyplot as plt
            fig = plt.figure(**figargs)
            ax = fig.add_axes([0.1, 0.1, 0.6, 0.75])
            for word, series in t_series.iteritems():
                plt.plot(keys, series, label=word)
            plt.xlabel('Time Slice')
            plt.ylabel('Probability of word in topic')
            plt.title('Evolution of topic {0}'.format(k))
            plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
            plt.xlim(keys[0], keys[-1])
            plt.savefig('{0}/topic_{1}_evolution.png'.format(self.outpath, k))
    
        return keys, t_series

[docs]    def topic_over_time(self, k, threshold=0.05, mode='documents', 
                                 normed=True, plot=False, 
                                 figargs={'figsize':(10,10)} ):
        """
        Representation of topic ``k`` over 'date' slice axis.
        
        The :class:`.Corpus` used to initialize the :class:`.DTMModelManager`
        must have been already sliced by 'date'.
        
        Parameters
        ----------
        k : int
            Topic index.
        threshold : float
            Minimum representation of ``k`` in a document.
        mode : str
            'documents' counts the number documents that contain ``k``;
            'proportions' sums the representation of ``k`` in each document
            that contains it.
        normed : bool
            (default: True) Normalizes values by the number of documents in each
            slice.
        plot : bool
            (default: False) If True, generates a MatPlotLib figure and saves
            it to the :class:`MALLETModelManager` outpath.
        figargs : dict
            kwargs dict for :func:`matplotlib.pyplot.figure`\.
            
        Returns
        -------
        keys : array
            Keys into 'date' slice axis.
        R : array
            Representation of topic ``k`` over time.
            
        Examples
        --------
        
        .. code-block:: python
        
           >>> keys, repr = M.topic_over_time(1, plot=True)

        ...should return ``keys`` (date) and ``repr`` (% documents) for topic 1,
        and generate a plot like this one in your ``outpath``.
        
        .. figure:: _static/images/topic_1_over_time.png
           :width: 400
           :align: center
        """
        
        if k >= self.model.Z:
            raise ValueError('No such topic in this model.')
        
        items = self.model.dimension_items(k, threshold)
        slices = self.D.get_slices('date')
        keys = sorted(slices.keys())

        R = []

        topic_label = self.model.print_topic(k,0)

        if mode == 'documents': # Documents that contain k.
            for t in keys:
                docs = slices[t]
                Ndocs = float(len(docs))
                Ncontains = 0.
                for i,w in items:
                    if i in docs:
                        Ncontains += 1.
                if normed:  # As a percentage of docs in each slice.
                    ylabel = 'Percentage of documents containing topic.'
                    if Ndocs > 0.:
                        R.append( Ncontains/Ndocs )
                    else:
                        R.append( 0. )
                else:       # Raw count.
                    ylabel = 'Number of documents containing topic.'                
                    R.append( Ncontains )

        elif mode == 'proportions': # Representation of topic k.
            for t in keys:
                docs = slices[t]
                Ndocs = float(len(docs))
                if normed:      # Normalized by number of docs in each slice.
                    ylabel = 'Normed representation of topic in documents.'                
                    if Ndocs > 0.:
                        R.append( sum([ w for i,w in items if i in docs ])
                                                                        /Ndocs )
                    else:
                        R.append( 0. )
                else:
                    ylabel = 'Sum of topic representation in documents.'                
                    R.append( sum([ w for i,w in items if i in docs ]) )
        
        if plot:    # Generates a simple lineplot and saves it in the outpath.
            import matplotlib.pyplot as plt
            fig = plt.figure(**figargs)
            plt.plot(np.array(keys), np.array(R))
            plt.xlabel('Time Slice')
            plt.ylabel(ylabel)      # Set based on mode.
            plt.title(topic_label)
            plt.savefig('{0}/topic_{1}_over_time.png'.format(self.outpath, k))        
        
        return np.array(keys), np.array(R)