# Source code for tethne.model.managers.mallet

"""
Classes and methods related to the :class:`.MALLETModelManager`\.
"""

import os
import re
import shutil
import tempfile
import subprocess
import numpy as np

from networkx import Graph

import logging
# Module-level logging setup. NOTE(review): basicConfig() configures the root
# logger as an import-time side effect; confirm that is intended for a library.
logging.basicConfig()
logger = logging.getLogger(__name__)
# Only ERROR and above are emitted by this module's logger, so the
# logger.debug() progress messages below stay hidden unless this is lowered.
logger.setLevel('ERROR')

from ...classes import GraphCollection
#from ..social import TAPModel
from ..managers import ModelManager
from ...writers.corpora import to_documents
from ..corpus.ldamodel import from_mallet, LDAModel

class MALLETModelManager(ModelManager):
    r"""
    Generates a :class:`.LDAModel` from a :class:`.Corpus` using
    `MALLET <http://mallet.cs.umass.edu/>`_.
    
    The :class:`.Corpus` should already contain at least one featureset,
    indicated by the `feature` parameter, such as wordcounts. You may
    specify two working directories: `temppath` should be a working
    directory that will contain intermediate files (e.g. documents, data
    files, metadata), while `outpath` will contain the final model and any 
    plots generated during the modeling process. If `temppath` is not
    provided, generates and uses a system temporary directory.
    
    Tethne comes bundled with a recent version of MALLET. If you would
    rather use your own install, you can do so by providing the 
    `mallet_path` parameter. This should point to the directory containing
    ``/bin/mallet``.
    
    .. autosummary::
       :nosignatures:
       
       topic_over_time
    
    Parameters
    ----------
    D : :class:`.Corpus`
    feature : str
        Key from D.features containing wordcounts (or whatever
        you want to model with).
    outpath : str
        Path to output directory.
    temppath : str
        Path to temporary directory.
    mallet_path : str
        Path to MALLET install directory (contains bin/mallet).
        
    Examples
    --------
        
    Starting with some JSTOR DfR data (with wordcounts), a typical workflow
    might look something like this:
    
    .. code-block:: python
    
       >>> from nltk.corpus import stopwords                 #  1. Get stoplist.
       >>> stoplist = stopwords.words()
       
       >>> from tethne.readers import dfr                    #  2. Build Corpus.
       >>> C = dfr.corpus_from_dir('/path/to/DfR/datasets', 'uni', stoplist)
       
       >>> def filt(s, C, DC):                           # 3. Filter wordcounts.
       ...     if C > 3 and DC > 1 and len(s) > 3:
       ...         return True
       ...     return False
       >>> C.filter_features('wordcounts', 'wc_filtered', filt)
       
       >>> from tethne.model import MALLETModelManager       #   4. Get Manager.
       >>> outpath = '/path/to/my/working/directory'
       >>> mallet = '/Applications/mallet-2.0.7'
       >>> M = MALLETModelManager(C, 'wc_filtered', outpath, mallet_path=mallet)
       
       >>> M.prep()                                          #    5. Prep model.
       
       >>> model = M.build(Z=50, max_iter=300)               #   6. Build model.
       >>> model                                             # (may take awhile)
       <tethne.model.corpus.ldamodel.LDAModel at 0x10bfac710>

    A plot showing the log-likelihood/topic over modeling iterations should be
    generated in your `outpath`. For example:
    
    .. figure:: _static/images/ldamodel_LL.png
       :width: 400
       :align: center
       
    Behind the scenes, the :func:`.prep` procedure generates a plain-text corpus
    file at `temppath`, along with a metadata file. MALLET's ``import-file``
    procedure is then called, which translates the corpus into MALLET's internal
    format (also stored at the `temppath`).
    
    The :func:`.build` procedure then invokes MALLET's ``train-topics``
    procedure. This step may take a considerable amount of time, anywhere from 
    a few minutes (small corpus, few topics) to a few hours (large corpus, many
    topics).

    For a :class:`.Corpus` with a few thousand :class:`.Paper`\s, 300 - 500 
    iterations is often sufficient to achieve convergence for 20-100 topics.
    
    Once the :class:`.LDAModel` is built, you can access its methods directly.
    See full method descriptions in :class:`.LDAModel`\.
    
    For more information about topic modeling with MALLET see 
    `this tutorial <http://programminghistorian.org/lessons/topic-modeling-and-mallet>`_.
    
    """
    
    def __init__(self, D, feature='unigrams', outpath='/tmp/', temppath=None,
                          mallet_path='./model/bin/mallet-2.0.7'):
        """
        Store the corpus and derive the paths of all working files.

        ``self.temp`` and ``self.outpath`` are read here but not assigned
        here; presumably the parent ``ModelManager.__init__`` sets them from
        `outpath` and `temppath` -- TODO confirm.

        Raises
        ------
        KeyError
            If `feature` is not a key in ``D.features``.
        """
        super(MALLETModelManager, self).__init__(outpath, temppath)
        
        self.D = D
        self.mallet_path = mallet_path
        self.feature = feature
        
        # Intermediate files (plain-text corpus, metadata, MALLET input).
        self.input_path = '{0}/input.mallet'.format(self.temp)
        self.corpus_path = self.temp+'/tethne_docs.txt'
        self.meta_path = self.temp+'/tethne_meta.csv'
    
        # MALLET outputs: document-topic and word-topic data, and the model.
        self.dt = '{0}/dt.dat'.format(self.temp)
        self.wt = '{0}/wt.dat'.format(self.temp)
        self.om = '{0}/model.mallet'.format(self.outpath)
    
        self.vocabulary = self.D.features[self.feature]['index']

    def _mallet_executable(self):
        """
        Return the path of the MALLET launcher, caching it on ``self.mallet``.
        """
        self.mallet = self.mallet_path + "/bin/mallet"
        return self.mallet

    def _generate_corpus(self, meta):
        """
        Writes a corpus to disk amenable to MALLET topic modeling.

        Parameters
        ----------
        meta : list
            Keys that are looked up on every paper in ``self.D.papers`` and
            exported alongside the corpus.
        """
        
        # Metadata to export with corpus. ``items()`` (rather than the
        # Python 2-only ``iteritems()``) works on both Python 2 and 3.
        metadata = ( meta, { p: { k:paper[k] for k in meta }
                       for p,paper in self.D.papers.items() } )
        
        # Export the corpus.
        to_documents(
            self.temp+'/tethne',            # Temporary files.
            self.D.features[self.feature]['features'],
            metadata=metadata,
            vocab=self.D.features[self.feature]['index'] )
        
        self._export_corpus()
    
    def _export_corpus(self):
        """
        Calls MALLET's `import-file` method.

        Raises
        ------
        OSError
            If the MALLET launcher cannot be executed.
        RuntimeError
            If MALLET exits with a non-zero status.
        """
        # bin/mallet import-file --input /Users/erickpeirson/mycorpus_docs.txt
        #     --output mytopic-input.mallet --keep-sequence --remove-stopwords
        
        # Each option and its value are separate argv entries, so this does
        # not rely on the launcher script re-splitting its arguments.
        command = [ self._mallet_executable(),
                    'import-file',
                    '--input', str(self.corpus_path),
                    '--output', str(self.input_path),
                    '--keep-sequence',          # Required (oddly) for LDA.
                    '--remove-stopwords' ]      # Probably redundant.
        try:
            exit_code = subprocess.call(command)
        except OSError:     # Raised if mallet_path is bad.
            raise OSError("MALLET path invalid or non-existent.")

        if exit_code != 0:
            raise RuntimeError(
                        "MALLET import-file failed: {0}.".format(exit_code))

    def _run_model(self, max_iter=20, **kwargs):
        """
        Calls MALLET's `train-topics` method and tracks its progress.

        MALLET's stderr is parsed line by line: every signed decimal found
        is appended to ``self.ll`` and every leading ``<N>`` iteration marker
        is appended to ``self.ll_iters``. ``self.Z``, ``self.ll``,
        ``self.ll_iters`` and ``self.num_iters`` must already exist;
        presumably the parent ``ModelManager`` sets them -- TODO confirm.

        Parameters
        ----------
        max_iter : int
            Number of iterations passed to ``--num-iterations``.
        kwargs : dict
            Accepted for interface compatibility; not used.

        Raises
        ------
        OSError
            If the MALLET launcher cannot be executed.
        RuntimeError
            If MALLET exits with a non-zero status.
        """
        #$ bin/mallet train-topics --input mytopic-input.mallet --num-topics 100 
        #> --output-doc-topics /Users/erickpeirson/doc_top 
        #> --word-topic-counts-file /Users/erickpeirson/word_top 
        #> --output-topic-keys /Users/erickpeirson/topic_keys
        
        iter_pattern = re.compile(r'<(\d+)>')           # e.g. "<150> ..."
        ll_pattern = re.compile(r'([-+]\d+\.\d+)')      # e.g. "-8.91234"

        command = [ self._mallet_executable(),
                    'train-topics',
                    '--input', str(self.input_path),
                    '--num-topics', str(self.Z),
                    '--num-iterations', str(max_iter),
                    '--output-doc-topics', str(self.dt),
                    '--word-topic-counts-file', str(self.wt),
                    '--output-model', str(self.om) ]

        # stdout is never consumed, so it goes to the null device; an unread
        # PIPE can fill up and block MALLET indefinitely.
        with open(os.devnull, 'w') as devnull:
            try:
                process = subprocess.Popen( command,
                                            stdout=devnull,
                                            stderr=subprocess.PIPE )
            except OSError:     # Raised if mallet_path is bad.
                raise OSError("MALLET path invalid or non-existent.")

            # Handle output of MALLET in real time. Reading until EOF (rather
            # than polling the process) keeps the lines that are still
            # buffered when MALLET exits.
            try:
                for raw_line in iter(process.stderr.readline, b''):
                    line = raw_line.decode('utf-8', 'replace')

                    # Keep track of LL/topic; not every line will match.
                    ll_values = ll_pattern.findall(line)
                    if ll_values:
                        self.ll.append(float(ll_values[0]))

                    # Keep track of modeling progress; not every line
                    # will match.
                    marker = iter_pattern.match(line)
                    if marker is not None:
                        this_iter = float(marker.group(1))
                        self.ll_iters.append(this_iter)
                        progress = int(100 * this_iter/max_iter)
                        logger.debug(
                            'Modeling progress: {0}%.\r'.format( progress ))
            finally:
                process.stderr.close()

            exit_code = process.wait()

        if exit_code != 0:
            raise RuntimeError(
                        "MALLET train-topics failed: {0}.".format(exit_code))

        logger.debug('Modeling complete.')
        self.num_iters += max_iter
            
    def _load_model(self):
        """
        Loads the MALLET output files into ``self.model`` via
        :func:`.from_mallet`\\.
        """
        self.model = from_mallet(   self.dt, 
                                    self.wt, 
                                    self.meta_path  )

    def topic_over_time(self, k, threshold=0.05, mode='documents', 
                                 normed=True, plot=False, 
                                 figargs=None ):
        r"""
        Representation of topic ``k`` over 'date' slice axis.
        
        The :class:`.Corpus` used to initialize the
        :class:`.MALLETModelManager` must have been already sliced by 'date'.
        
        Parameters
        ----------
        k : int
            Topic index.
        threshold : float
            Minimum representation of ``k`` in a document.
        mode : str
            'documents' counts the number of documents that contain ``k``;
            'proportions' sums the representation of ``k`` in each document
            that contains it.
        normed : bool
            (default: True) Normalizes values by the number of documents in each
            slice.
        plot : bool
            (default: False) If True, generates a MatPlotLib figure and saves
            it to the :class:`MALLETModelManager` outpath.
        figargs : dict
            (default: ``{'figsize': (10, 10)}``) kwargs dict for
            :func:`matplotlib.pyplot.figure`\.
            
        Returns
        -------
        keys : array
            Keys into 'date' slice axis.
        R : array
            Representation of topic ``k`` over time.

        Raises
        ------
        ValueError
            If ``k`` is not a topic in the model, or `mode` is not one of
            'documents' or 'proportions'.
            
        Examples
        --------
        
        .. code-block:: python
        
           >>> keys, repr = M.topic_over_time(1, plot=True)

        ...should return ``keys`` (date) and ``repr`` (% documents) for topic 1,
        and generate a plot like this one in your ``outpath``.
        
        .. figure:: _static/images/topic_1_over_time.png
           :width: 400
           :align: center
           
        """
        
        if k >= self.model.Z:
            raise ValueError('No such topic in this model.')

        # Axis label for every supported (mode, normed) combination. Chosen
        # up front so that it is defined even when there are no slices.
        ylabels = {
            ('documents', True): 'Percentage of documents containing topic.',
            ('documents', False): 'Number of documents containing topic.',
            ('proportions', True):
                'Normed representation of topic in documents.',
            ('proportions', False):
                'Sum of topic representation in documents.',
        }
        try:
            ylabel = ylabels[(mode, bool(normed))]
        except KeyError:
            raise ValueError("mode must be 'documents' or 'proportions'.")

        if figargs is None:     # Avoids a shared mutable default argument.
            figargs = {'figsize': (10, 10)}
        
        items = self.model.dimension_items(k, threshold)
        slices = self.D.get_slices('date')
        keys = sorted(slices.keys())

        topic_label = self.model.print_topic(k)

        R = []
        for t in keys:
            docs = slices[t]
            Ndocs = float(len(docs))
            # Set membership replaces a list scan for every item.
            # NOTE(review): assumes document identifiers are hashable.
            members = set(docs)

            if mode == 'documents':     # Documents that contain k.
                value = float(sum( 1 for i,w in items if i in members ))
            else:                       # Representation of topic k.
                value = sum([ w for i,w in items if i in members ])

            if normed:  # Relative to the number of docs in each slice.
                value = value/Ndocs if Ndocs > 0. else 0.
            R.append( value )
        
        if plot:    # Generates a simple lineplot and saves it in the outpath.
            import matplotlib.pyplot as plt
            fig = plt.figure(**figargs)
            try:
                plt.plot(np.array(keys), np.array(R))
                plt.xlabel('Time Slice')
                plt.ylabel(ylabel)      # Set based on mode.
                plt.title(topic_label)
                plt.savefig('{0}/topic_{1}_over_time.png'.format(self.outpath,
                                                                 k))
            finally:
                plt.close(fig)  # Release the figure once it is on disk.
        
        return np.array(keys), np.array(R)