# Source code for tethne.model.corpus.ldamodel

"""
Classes and methods related to the :class:`.LDAModel`\.
"""

from ..basemodel import BaseModel
import numpy as np

import csv
from scipy.sparse import coo_matrix

class LDAModel(BaseModel):
    r"""
    Represents a Latent Dirichlet Allocation (LDA) topic model.

    In the LDA model, topics (dimensions) are probability distributions over
    words (features), and documents (items) are comprised of mixtures of
    topics. For a complete description of the model, see
    `Blei, Ng & Jordan (2003) <http://jmlr.org/papers/v3/blei03a.html>`_.

    To generate a :class:`.LDAModel` from a :class:`.Corpus` using
    `MALLET <http://mallet.cs.umass.edu/>`_, use the
    :class:`.MALLETModelManager`\. Additional managers for :class:`.LDAModel`\s
    will be added shortly.

    You can also initialize a :class:`.LDAModel` directly by providing the
    following parameters:

    * ``theta`` describes the proportion of topics (cols) in each document
      (rows).
    * ``phi`` describes the topic (rows) distributions over words (cols).
    * ``metadata`` should map matrix indices for documents onto dicts of
      document metadata; each dict must have an ``'id'`` entry holding the
      :class:`.Paper` ID (or whatever you use to identify documents).
    * ``vocabulary`` should map matrix indices for words onto word-strings.

    Finally, you can use :func:`.from_mallet` to generate a :class:`.LDAModel`
    from MALLET output.

    .. autosummary::
       :nosignatures:

       list_topic
       list_topics
       print_topic
       print_topics

    Parameters
    ----------
    theta : matrix-like
        Distribution of topics (cols) in documents (rows). Rows sum to 1.
    phi : matrix-like
        Distribution over words (cols) for topics (rows). Rows sum to 1.
    metadata : dict
        Maps matrix (document) indices onto document metadata dicts. Every
        value must contain an ``'id'`` key.
    vocabulary : dict
        Maps W indices onto words.
    """

    def __init__(self, theta, phi, metadata, vocabulary):
        r"""
        Initialize the :class:`.LDAModel`\.

        Stores the matrices and mappings, records the model dimensions
        (``M`` documents, ``Z`` topics, ``W`` terms), and builds ``lookup``,
        the reverse mapping from document ``'id'`` to matrix row index.
        """

        self.theta = theta
        self.M = theta.shape[0]     # Number of documents.

        self.phi = phi
        self.Z = phi.shape[0]       # Number of topics.
        self.W = phi.shape[1]       # Number of terms.

        self.metadata = metadata
        self.vocabulary = vocabulary

        # ``items()`` rather than ``iteritems()`` so this runs on both
        # Python 2 and Python 3.
        self.lookup = {v['id']: k for k, v in metadata.items()}

    # Obligatory methods.
    # NOTE(review): presumably these are the hooks that BaseModel dispatches
    # to; BaseModel is defined in another module -- confirm there.
    def _item_description(self, i, **kwargs):
        """
        Returns the proportion of each topic in document ``i``.

        Parameters
        ----------
        i : int
            Document (row) index in ``theta``.

        Returns
        -------
        description : list
            A list of ( topic index, proportion ) tuples, one per topic.
        """

        return [(t, self.theta[i, t]) for t in range(self.theta.shape[1])]

    def _item_relationship(self, i, j, **kwargs):
        """
        Placeholder for the relationship between two documents.

        Not implemented yet; always returns ``None``.
        """
        # TODO: implement cosine-similarity or another similarity metric here.

        return None

    def _dimension_description(self, k, **kwargs):
        """
        Returns the probability distribution over terms for topic ``k``.

        Parameters
        ----------
        k : int
            Topic (row) index in ``phi``.

        Returns
        -------
        description : list
            A list of ( word index, probability ) tuples, one per term.
        """
        return [(w, self.phi[k, w]) for w in range(self.phi.shape[1])]

    def _dimension_relationship(self, k, e, **kwargs):
        """
        Placeholder for the relationship between two topics.

        There is no additional information about dimensions yet; always
        returns ``None``.
        """
        # TODO: implement a similarity metric for topics.

        return None

    def _dimension_items(self, k, threshold, **kwargs):
        """
        Returns items that contain ``k`` at or above ``threshold``.

        Parameters
        ----------
        k : int
            Topic index.
        threshold : float
            Minimum representation of ``k`` in document.

        Returns
        -------
        description : list
            A list of ( item, weight ) tuples, where ``item`` is the
            ``'id'`` entry of the document's metadata.
        """

        description = [(self.metadata[i]['id'], self.theta[i, k])
                       for i in range(self.theta.shape[0])
                       if self.theta[i, k] >= threshold]
        return description

    def list_topic(self, k, Nwords=10):
        """
        Returns a list of the top ``Nwords`` for topic ``k``.

        Parameters
        ----------
        k : int
            A topic index.
        Nwords : int
            Number of words to return.

        Returns
        -------
        as_list : list
            List of words in topic.

        Examples
        --------

        .. code-block:: python

           >>> model.list_topic(1, Nwords=5)
           [ 'opposed', 'terminates', 'trichinosis', 'cistus', 'acaule' ]

        """
        # ``dimension`` is not defined in this class; presumably inherited
        # from BaseModel and yielding ( word index, probability ) pairs.
        words = self.dimension(k, top=Nwords)
        as_list = [self.vocabulary[w] for w, p in words]

        return as_list

    def print_topic(self, k, Nwords=10):
        """
        Returns the top ``Nwords`` for topic ``k`` as a string.

        Parameters
        ----------
        k : int
            A topic index.
        Nwords : int
            Number of words to return.

        Returns
        -------
        as_string : str
            Comma-joined list of words in topic.

        Examples
        --------

        .. code-block:: python

           >>> model.print_topic(1, Nwords=5)
           'opposed, terminates, trichinosis, cistus, acaule'
        """

        as_string = ', '.join(self.list_topic(k, Nwords))

        return as_string

    def list_topics(self, Nwords=10):
        """
        Returns lists of the top ``Nwords`` for each topic.

        Parameters
        ----------
        Nwords : int
            Number of words to return for each topic.

        Returns
        -------
        as_dict : dict
            Keys are topic indices, values are list of words.
        """

        as_dict = {}
        for k in range(self.Z):
            as_dict[k] = self.list_topic(k, Nwords)

        return as_dict

    def print_topics(self, Nwords=10):
        """
        Returns the top ``Nwords`` for each topic, as a string.

        Parameters
        ----------
        Nwords : int
            Number of words to return for each topic.

        Returns
        -------
        as_string : str
            Newline-delimited lists of words for each topic, one line per
            topic in ascending topic-index order.
        """

        as_dict = self.list_topics(Nwords)
        # Sort by topic index so the output order does not depend on dict
        # iteration order.
        s = ['{0}: {1}'.format(key, ', '.join(value))
             for key, value in sorted(as_dict.items())]
        as_string = '\n'.join(s)

        return as_string

def from_mallet(top_doc, word_top, metadata):
    """
    Generate a :class:`.LDAModel` from MALLET output.

    MALLET's LDA topic modeling algorithm produces multiple output files. See
    the `MALLET documentation <http://mallet.cs.umass.edu/topics.php>`_ for
    details. When invoking MALLET's ``train-topics`` procedure, you should
    have provided the ``--output-doc-topics`` and ``--word-topic-counts-file``
    parameters; the ``top_doc`` and ``word_top`` parameters should be paths
    to those two files.

    You should also provide the path ``metadata`` to a tab-separated file
    containing metadata about the documents used to build the model. The first
    row is read as a header naming the columns, and one of those columns must
    be called ``id``. In every following row, the first column should be the
    ID used in the original corpus files (it is matched against the document
    names in ``top_doc``). For example (columns are tab-separated)::

       id                date    title                       journal
       10.2307/1709733   1962    BOTANICAL CLASSIFICATION    SCIENCE
       10.2307/20000814  1974    THE USE OF DIFFERENTIAL...  AREA

    Parameters
    ----------
    top_doc : string
        Path to topic-document datafile generated with ``--output-doc-topics``.
    word_top : string
        Path to word-topic datafile generated with ``--word-topic-counts-file``.
    metadata : string
        Path to tab-separated metadata file with :class:`.Paper` keys.

    Returns
    -------
    ldamodel : :class:`.LDAModel`

    """

    # All parsing is delegated to MALLETLoader; this is a convenience wrapper.
    loader = MALLETLoader(top_doc, word_top, metadata)
    model = loader.load()

    return model

class MALLETLoader(object):
    """
    Used by :func:`.from_mallet` to load MALLET output.

    Call :meth:`load` to parse the three input files and build the model.
    """

    def __init__(self, top_doc, word_top, metapath):
        """
        Parameters
        ----------
        top_doc : string
            Path to topic-document datafile generated with
            ``--output-doc-topics``.
        word_top : string
            Path to word-topic datafile generated with
            ``--word-topic-counts-file``.
        metapath : string
            Path to tab-separated metadata file with :class:`.Paper` keys.
        """
        self.top_doc = top_doc
        self.word_top = word_top
        self.metapath = metapath

    def load(self):
        """
        Load a :class:`.LDAModel` from MALLET output.

        Returns
        -------
        self.model : :class:`.LDAModel`
        """
        # Order matters: the metadata step needs ``self.doc_index``, which
        # the topic-document step produces.
        self._handle_top_doc()
        self._handle_metadata()
        self._handle_word_top()

        self.model = LDAModel(self.theta, self.phi, self.metadata,
                              self.vocabulary)
        return self.model

    def _handle_top_doc(self):
        """
        Reconstructs the theta posterior distributions from ``top_doc``.

        The first row of the file is skipped as a header. Every other row is
        read as ``<doc index> <doc name> <topic> <proportion> ...`` with tab
        separators; a trailing unpaired field (e.g. from a trailing tab) is
        ignored, and blank rows are skipped.

        Returns
        -------
        theta : Numpy matrix
            Rows are documents, columns are topics. Rows sum to ~1.
        doc_index : dict
            Maps document (row) indices onto document names.
        """

        D = []      # Document indices.
        T = []      # Topic indices.
        P = []      # Proportions.

        doc_index = {}

        # Text mode: the csv module requires it on Python 3.
        with open(self.top_doc, "r") as f:
            reader = csv.reader(f, delimiter='\t')
            next(reader, None)      # Avoid header row.
            for line in reader:
                if not line:        # Blank row.
                    continue

                d = int(line[0])
                doc_index[d] = str(line[1])

                # Remaining fields are (topic, proportion) pairs.
                pairs = line[2:]
                for j in range(0, len(pairs) - 1, 2):
                    D.append(d)
                    T.append(int(pairs[j]))
                    P.append(float(pairs[j + 1]))

        # Size the matrix from the largest index seen, not from the number
        # of distinct indices: a topic that never appears in the file would
        # otherwise leave the matrix too small for the indices it must hold.
        M = max(doc_index) + 1 if doc_index else 0  # Number of documents.
        K = max(T) + 1 if T else 0                  # Number of topics.

        self.theta = coo_matrix((P, (D, T)), shape=(M, K)).todense()
        self.doc_index = doc_index

        return self.theta, self.doc_index

    def _handle_word_top(self):
        """
        Reconstructs the phi posterior distributions from ``word_top``.

        Each row of the file is read as ``<word index> <word>
        <topic>:<count> ...`` with single-space separators. Counts are
        normalized per topic; a topic with no counts at all is left as a row
        of zeros rather than being divided by zero.

        Returns
        -------
        phi : Numpy matrix
            Rows are topics, columns are words. Rows with any counts sum
            to ~1.
        vocabulary : dict
            Maps word (column) indices onto words.

        Raises
        ------
        ValueError
            If the file contains no topic counts.
        """

        vocabulary = {}

        W = []      # Word indices.
        T = []      # Topic indices.
        C = []      # Counts.

        with open(self.word_top, "r") as f:
            reader = csv.reader(f, delimiter=' ')
            for line in reader:
                if not line:        # Blank row.
                    continue

                w = int(line[0])
                vocabulary[w] = str(line[1])
                for entry in line[2:]:
                    if not entry:   # Empty token, e.g. from a trailing space.
                        continue
                    k, c = entry.split(':')
                    W.append(w)
                    T.append(int(k))
                    C.append(float(c))

        if not T:
            raise ValueError(
                'No topic counts found in {0}'.format(self.word_top))

        K = max(T) + 1
        # Use the vocabulary, not W: a word listed without any counts still
        # occupies a column.
        V = max(vocabulary) + 1

        phi = coo_matrix((C, (T, W)), shape=(K, V)).todense()

        # Normalize each topic's counts into a probability distribution.
        for k in range(K):
            total = np.sum(phi[k, :])
            if total > 0:
                phi[k, :] /= total

        self.phi = phi
        self.vocabulary = vocabulary

        return phi, vocabulary

    def _handle_metadata(self):
        """
        Reads the tab-separated metadata file.

        The first row supplies the field names. In each following row the
        first column is looked up in ``self.doc_index`` (so
        :meth:`_handle_top_doc` must have run first) to find the document's
        matrix index. Blank rows are skipped; an empty file yields an empty
        mapping.

        Returns
        -------
        md : dict
            Keys are document indices, values are dicts mapping field names
            onto that document's values.

        Raises
        ------
        KeyError
            If a row's first column is not a document name in
            ``self.doc_index``.
        IndexError
            If a row has more fields than the header.
        """

        # Reverse of doc_index: document name -> matrix index.
        lookup = {name: idx for idx, name in self.doc_index.items()}

        md = {}

        # Plain "r": the "U" flag was removed in Python 3.11, and Python 3
        # text mode already applies universal-newline handling.
        with open(self.metapath, "r") as f:
            reader = csv.reader(f, delimiter='\t')
            keys = next(reader, None)
            if keys is not None:
                for row in reader:
                    if not row:     # Blank row.
                        continue
                    md[lookup[row[0]]] = {keys[i]: value
                                          for i, value in enumerate(row)}

        self.metadata = md

        return md