SciPy

Source code for tethne.model.managers.dtm

"""
Classes and methods related to the :class:`.DTMModelManager`\.
"""

import os
import re
import shutil
import tempfile
import subprocess
import numpy as np

from networkx import Graph

import logging
# Configure logging at import time with the library defaults.
logging.basicConfig()
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Only messages at ERROR level or above are emitted through this logger.
logger.setLevel('ERROR')

from ...classes import GraphCollection
from ..social import TAPModel
from ..managers import ModelManager
from ..corpus.dtmmodel import from_gerrish

class DTMModelManager(ModelManager):
    r"""
    Generates a :class:`.DTMModel` from a :class:`.Corpus` using
    `Gerrish's C++ implementation <http://code.google.com/p/princeton-statistical-learning/downloads/detail?name=dtm_release-0.8.tgz>`_.

    You should be sure to slice your :class:`.Corpus` by 'date' using the
    'time_period' method (for details, see :meth:`.Corpus.slice`\).

    .. autosummary::
       :nosignatures:

       plot_topic_evolution
       topic_over_time

    Parameters
    ----------
    D : :class:`.Corpus`
    feature : str
        Name of the featureset in ``D`` that is written out as DTM input
        (default: 'unigrams').
    outpath : str
        Path to output directory.
    temppath : str
        Passed through to :class:`.ModelManager`\; the DTM input files are
        written beneath the manager's temporary directory.
    dtm_path : str
        Path to the DTM executable (default: './bin/main').

    Examples
    --------
    Starting with some JSTOR DfR data (with wordcounts), a typical workflow
    might look something like this:

    .. code-block:: python

       >>> from nltk.corpus import stopwords                 #  1. Get stoplist.
       >>> stoplist = stopwords.words()

       >>> from tethne.readers import dfr                    #  2. Build Corpus.
       >>> C = dfr.corpus_from_dir('/path/to/DfR/datasets', 'uni', stoplist)

       >>> def filt(s, C, DC):                          # 3. Filter wordcounts.
       ...     if C > 3 and DC > 1 and len(s) > 3:
       ...         return True
       ...     return False
       >>> C.filter_features('wordcounts', 'wc_filtered', filt)

       >>> C.slice('date', 'time_period', window_size=5)     #  4. Slice Corpus.

       >>> from tethne.model import DTMModelManager          #   5. Get Manager.
       >>> outpath = '/path/to/my/working/directory'
       >>> dtm = '/path/to/dtm/bin/main'
       >>> M = DTMModelManager(C, 'wc_filtered', outpath, dtm_path=dtm)

       >>> M.prep()                                          #    6. Prep model.

       >>> model = M.build(Z=50)                             #   7. Build model.
       >>> model                                             # (may take awhile)
       <tethne.model.corpus.dtmmodel.DTMModel at 0x10bfac710>

    A plot showing the log-likelihood/topic over modeling iterations should be
    generated in your `outpath`. For example:

    .. figure:: _static/images/dtmmodel_LL.png
       :width: 400
       :align: center

    Behind the scenes, the :func:`.prep` procedure generates data files at
    ``temppath`` describing your :class:`.Corpus`\:

    * ``tethne-vocab.dat`` contains all of the words in the corpus, one per
      line.
    * ``tethne-mult.dat`` contains wordcounts for each document; words are
      represented by integer indices corresponding to line numbers in
      ``tethne-vocab.dat``. Documents are ordered by publication date (earliest
      to latest).
    * ``tethne-seq.dat`` describes how documents are to be apportioned among
      time-periods. The first line is the number of time periods, and the
      subsequent lines specify the number of documents in each successive
      time-period.
    * ``tethne-meta.dat`` is a tab-delimited metadata file. Those records occur
      in the same order as in the documents in ``tethne-mult.dat``. For
      example::

          id                 date    atitle
          10.2307/2437162    1945    SOME ECOTYPIC RELATIONS OF DESCHAMPSIA CAESPITOSA
          10.2307/4353229    1940    ENVIRONMENTAL INFLUENCE AND TRANSPLANT EXPERIMENTS
          10.2307/4353158    1937    SOME FUNDAMENTAL PROBLEMS OF TAXONOMY AND PHYLOGENETICS

    The :func:`.build` procedure then starts the DTM modeling algorithm. This
    step may take a considerable amount of time, anywhere from a few minutes
    (small corpus, few topics) to a few hours (large corpus, many topics).

    **Warning:** this implementation of DTM is known to run into memory issues
    with large vocabularies. If a memory-leak does occur, try using a more
    restrictive filter to the featureset, using
    :func:`.Corpus.filter_features`\.

    Once the :class:`.DTMModel` is built, you can access its methods directly.
    See full method descriptions in :class:`.DTMModel`\. Of special interest
    are:

    .. currentmodule:: tethne.model.corpus.dtmmodel

    .. autosummary::
       :nosignatures:

       DTMModel.list_topic_diachronic
       DTMModel.print_topic_diachronic
       DTMModel.topic_evolution

    To plot the evolution of a topic over time, use
    :func:`.plot_topic_evolution`\.

    .. code-block:: python

       >>> M.plot_topic_evolution(2, plot=True)

    ...should generate a plot at ``outpath`` called ``topic_2_evolution.png``:

    .. figure:: _static/images/topic_2_evolution.png
       :width: 400
       :align: center

    """

    # Progress lines that the DTM binary writes to stderr. Compiled once here
    # because they are applied to every line of output.
    _LHOOD_PATTERN = re.compile(r'^lhood\s+=\s+([-]?\d+\.\d+)')
    _CONV_PATTERN = re.compile(r'conv\s+=\s+([-]?\d+\.\d+e[-]\d+)')

    # Y-axis label for topic_over_time, keyed by (mode, normed).
    _YLABELS = {
        ('documents', True): 'Percentage of documents containing topic.',
        ('documents', False): 'Number of documents containing topic.',
        ('proportions', True): 'Normed representation of topic in documents.',
        ('proportions', False): 'Sum of topic representation in documents.',
    }

    def __init__(self, D, feature='unigrams', outpath='/tmp', temppath=None,
                 dtm_path='./bin/main'):
        """
        Store the corpus and DTM settings, and derive the working file paths.

        See the class docstring for a description of the parameters.
        """
        super(DTMModelManager, self).__init__(outpath, temppath)

        self.D = D
        self.dtm_path = dtm_path
        self.feature = feature

        # Prefix handed to the DTM binary as ``--outname``.
        self.outname = '{0}/model_run'.format(self.outpath)

        # Input files for the DTM binary.
        # NOTE(review): ``self.temp`` is not assigned in this class;
        # presumably ModelManager.__init__ sets it -- confirm.
        self.mult_path = '{0}/tethne-mult.dat'.format(self.temp)
        self.seq_path = '{0}/tethne-seq.dat'.format(self.temp)
        self.vocab_path = '{0}/tethne-vocab.dat'.format(self.temp)
        self.meta_path = '{0}/tethne-meta.dat'.format(self.temp)

    def _generate_corpus(self, meta):
        """
        Write the DTM input files for ``self.D`` under the ``tethne`` prefix.

        Parameters
        ----------
        meta : list
            Passed to :func:`.to_dtm_input` as ``fields``.
        """
        from tethne.writers.corpora import to_dtm_input

        to_dtm_input(self.temp+'/tethne', self.D, self.feature, fields=meta)

    def _run_model(self, **kwargs):
        """
        Run the DTM executable and record its progress.

        Log-likelihood values reported on stderr are appended to ``self.ll``
        (with a running counter in ``self.ll_iters``), and convergence values
        are collected in ``self.conv``.

        Parameters
        ----------
        kwargs : kwargs
            Optional DTM settings: ``top_chain_var`` (default 0.005),
            ``lda_seq_min_iter`` (6), ``lda_seq_max_iter`` (20),
            ``lda_max_em_iter`` (10) and ``alpha`` (0.01).
        """
        ## Run the dynamic topic model.
        #./main \
        #  --ntopics=20 \
        #  --mode=fit \
        #  --rng_seed=0 \
        #  --initialize_lda=true \
        #  --corpus_prefix=example/test \
        #  --outname=example/model_run \
        #  --top_chain_var=0.005 \
        #  --alpha=0.01 \
        #  --lda_sequence_min_iter=6 \
        #  --lda_sequence_max_iter=20 \
        #  --lda_max_em_iter=10

        top_chain_var = kwargs.get('top_chain_var', 0.005)
        lda_seq_min_iter = kwargs.get('lda_seq_min_iter', 6)
        lda_seq_max_iter = kwargs.get('lda_seq_max_iter', 20)
        lda_max_em_iter = kwargs.get('lda_max_em_iter', 10)
        alpha = kwargs.get('alpha', 0.01)

        corpus_prefix = '{0}/tethne'.format(self.temp)

        # NOTE(review): ``self.Z``, ``self.ll``, ``self.ll_iters`` and
        # ``self.num_iters`` are not assigned in this class; presumably the
        # ModelManager base class provides them -- confirm.
        command = [
            self.dtm_path,
            '--ntopics={0}'.format(self.Z),
            '--mode=fit',
            '--rng_seed=0',
            '--initialize_lda=true',
            '--corpus_prefix={0}'.format(corpus_prefix),
            '--outname={0}'.format(self.outname),
            '--top_chain_var={0}'.format(top_chain_var),
            '--alpha={0}'.format(alpha),
            '--lda_sequence_min_iter={0}'.format(lda_seq_min_iter),
            '--lda_sequence_max_iter={0}'.format(lda_seq_max_iter),
            '--lda_max_em_iter={0}'.format(lda_max_em_iter),
        ]

        self.conv = []
        iteration = 1

        # The context manager guarantees the devnull handle is released.
        with open(os.devnull, 'w') as devnull:
            # universal_newlines=True yields text on both Python 2 and 3, so
            # the str patterns above can be applied to each line.
            process = subprocess.Popen(command,
                                       stdout=devnull,
                                       stderr=subprocess.PIPE,
                                       universal_newlines=True)
            try:
                # Read until EOF rather than until the process exits, so that
                # lines still buffered at exit are not lost.
                for line in iter(process.stderr.readline, ''):
                    lhood = self._LHOOD_PATTERN.search(line)
                    if lhood:
                        self.ll.append(float(lhood.group(1)))
                        self.ll_iters.append(iteration)
                        iteration += 1

                    conv = self._CONV_PATTERN.search(line)
                    if conv:
                        self.conv.append(float(conv.group(1)))
            finally:
                process.stderr.close()
                process.wait()

        if process.returncode != 0:
            # Reported rather than raised, so callers see the same exceptions
            # as before; a failed run surfaces when the model is loaded.
            logger.error('DTM executable %s exited with status %s',
                         self.dtm_path, process.returncode)

        self.num_iters += lda_max_em_iter   # TODO: does this make sense?

    def _load_model(self):
        r"""Load and return a :class:`.DTMModel`\."""
        self.model = from_gerrish(self.outname, self.meta_path,
                                  self.vocab_path)
        self.vocabulary = self.model.vocabulary
        return self.model

    def plot_topic_evolution(self, k, Nwords=5, plot=False, figargs=None):
        r"""
        Plot the probability of the top ``Nwords`` words in topic ``k`` over
        time.

        If ``plot`` is True, generates a plot image at ``outpath``.

        TODO: should return a Figure object.

        Parameters
        ----------
        k : int
            Topic index.
        Nwords : int
            Number of words to include in plot.
        plot : bool
            (default: False) If True, generates a plot image at ``outpath``.
        figargs : dict
            Keyword arguments to pass to :func:`matplotlib.pyplot.figure`\.
            Defaults to ``{'figsize': (10, 10)}``.

        Returns
        -------
        keys : list
            Sorted keys of the 'date' slice axis of the :class:`.Corpus`\.
        t_series : dict
            As returned by :meth:`.DTMModel.topic_evolution`\; maps each word
            to its series of values over time.

        Examples
        --------

        .. code-block:: python

           >>> M.plot_topic_evolution(2, plot=True)

        ...should generate a plot at ``outpath`` called
        ``topic_2_evolution.png``:

        .. figure:: _static/images/topic_2_evolution.png
           :width: 400
           :align: center

        """
        if figargs is None:     # Avoid a shared mutable default argument.
            figargs = {'figsize': (10, 10)}

        # The model returns its own keys too, but the x-axis (and the return
        # value) use the slice keys of the corpus.
        _, t_series = self.model.topic_evolution(k, Nwords)
        keys = sorted(self.D.get_slices('date'))

        if plot:
            import matplotlib.pyplot as plt

            fig = plt.figure(**figargs)
            # Narrow axes leave room on the right for the legend.
            fig.add_axes([0.1, 0.1, 0.6, 0.75])

            for word, series in t_series.items():
                plt.plot(keys, series, label=word)

            plt.xlabel('Time Slice')
            plt.ylabel('Probability of word in topic')
            plt.title('Evolution of topic {0}'.format(k))
            plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
            if keys:    # No limits to set when the corpus has no slices.
                plt.xlim(keys[0], keys[-1])
            plt.savefig('{0}/topic_{1}_evolution.png'.format(self.outpath, k))

        return keys, t_series

    def topic_over_time(self, k, threshold=0.05, mode='documents',
                        normed=True, plot=False, figargs=None):
        r"""
        Representation of topic ``k`` over 'date' slice axis.

        The :class:`.Corpus` used to initialize the :class:`.DTMModelManager`
        must have been already sliced by 'date'.

        Parameters
        ----------
        k : int
            Topic index.
        threshold : float
            Minimum representation of ``k`` in a document.
        mode : str
            'documents' counts the number documents that contain ``k``;
            'proportions' sums the representation of ``k`` in each document
            that contains it.
        normed : bool
            (default: True) Normalizes values by the number of documents in
            each slice.
        plot : bool
            (default: False) If True, generates a MatPlotLib figure and saves
            it to the :class:`.DTMModelManager` outpath.
        figargs : dict
            kwargs dict for :func:`matplotlib.pyplot.figure`\. Defaults to
            ``{'figsize': (10, 10)}``.

        Returns
        -------
        keys : array
            Keys into 'date' slice axis.
        R : array
            Representation of topic ``k`` over time.

        Raises
        ------
        ValueError
            If ``k`` is not smaller than the number of topics in the model, or
            if ``mode`` is neither 'documents' nor 'proportions'.

        Examples
        --------

        .. code-block:: python

           >>> keys, R = M.topic_over_time(1, plot=True)

        ...should return ``keys`` (date) and ``R`` (% documents) for topic 1,
        and generate a plot like this one in your ``outpath``.

        .. figure:: _static/images/topic_1_over_time.png
           :width: 400
           :align: center

        """
        if k >= self.model.Z:
            raise ValueError('No such topic in this model.')
        if mode not in ('documents', 'proportions'):
            raise ValueError("mode must be 'documents' or 'proportions'.")
        if figargs is None:     # Avoid a shared mutable default argument.
            figargs = {'figsize': (10, 10)}

        # Chosen up front so that it is defined even when there are no slices.
        ylabel = self._YLABELS[(mode, bool(normed))]

        items = self.model.dimension_items(k, threshold)
        slices = self.D.get_slices('date')
        keys = sorted(slices)
        topic_label = self.model.print_topic(k, 0)

        R = []
        for t in keys:
            docs = slices[t]
            Ndocs = float(len(docs))
            # Set membership keeps the per-item test cheap for large slices.
            # NOTE(review): assumes document identifiers are hashable.
            members = set(docs)

            if mode == 'documents':     # Documents that contain k.
                value = float(sum(1 for i, w in items if i in members))
            else:                       # Representation of topic k.
                value = sum(w for i, w in items if i in members)

            if normed:                  # Relative to the size of the slice.
                value = value/Ndocs if Ndocs > 0. else 0.
            R.append(value)

        if plot:    # Generates a simple lineplot and saves it in the outpath.
            import matplotlib.pyplot as plt

            plt.figure(**figargs)
            plt.plot(np.array(keys), np.array(R))
            plt.xlabel('Time Slice')
            plt.ylabel(ylabel)      # Set based on mode.
            plt.title(topic_label)
            plt.savefig('{0}/topic_{1}_over_time.png'.format(self.outpath, k))

        return np.array(keys), np.array(R)