Source code for tethne.model.corpus.mallet

"""
Classes and methods related to the :class:`.MALLETModelManager`\.
"""

import os
import re
import shutil
import tempfile
import subprocess
import csv
import platform
from collections import defaultdict

from networkx import Graph

import sys
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    unicode = str
    xrange = range

import logging
logging.basicConfig()
logger = logging.getLogger('mallet')
logger.setLevel('ERROR')

from tethne import write_documents, Feature, FeatureSet
from tethne.model import Model
import tethne

# Determine path to MALLET.
TETHNE_PATH = os.path.split(os.path.abspath(tethne.__file__))[0]
MALLET_PATH = os.path.join(TETHNE_PATH, 'bin', 'mallet-2.0.7')
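
# To use your own MALLET install instead of the bundled copy, point
# ``mallet_path`` at its root directory (the one containing ``bin/mallet``);
# the path below is illustrative:
#
#     LDAModel.mallet_path = '/Applications/mallet-2.0.7'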


class LDAModel(Model):
    """
    Generates a :class:`.LDAModel` from a :class:`.Corpus` using
    `MALLET <http://mallet.cs.umass.edu/>`_.

    The :class:`.Corpus` should already contain at least one featureset,
    indicated by the `feature` parameter, such as wordcounts.

    You may specify two working directories: `temppath` should be a working
    directory that will contain intermediate files (e.g. documents, data
    files, metadata), while `outpath` will contain the final model and any
    plots generated during the modeling process. If `temppath` is not
    provided, generates and uses a system temporary directory.

    Tethne comes bundled with a recent version of MALLET. If you would rather
    use your own install, you can do so by providing the `mallet_path`
    parameter. This should point to the directory containing ``/bin/mallet``.

    .. autosummary::
       :nosignatures:

       topic_over_time

    Parameters
    ----------
    D : :class:`.Corpus`
    feature : str
        Key from D.features containing wordcounts (or whatever you want to
        model with).
    outpath : str
        Path to output directory.
    temppath : str
        Path to temporary directory.
    mallet_path : str
        Path to MALLET install directory (contains bin/mallet).

    Examples
    --------

    Starting with some JSTOR DfR data (with wordcounts), a typical workflow
    might look something like this:

    .. code-block:: python

       >>> from nltk.corpus import stopwords              # 1. Get stoplist.
       >>> stoplist = stopwords.words()

       >>> from tethne.readers import dfr                 # 2. Build Corpus.
       >>> C = dfr.corpus_from_dir('/path/to/DfR/datasets', 'uni', stoplist)

       >>> def filt(s, C, DC):                            # 3. Filter wordcounts.
       ...     if C > 3 and DC > 1 and len(s) > 3:
       ...         return True
       ...     return False
       >>> C.filter_features('wordcounts', 'wc_filtered', filt)

       >>> from tethne.model import MALLETModelManager    # 4. Get Manager.
       >>> outpath = '/path/to/my/working/directory'
       >>> mallet = '/Applications/mallet-2.0.7'
       >>> M = MALLETModelManager(C, 'wc_filtered', outpath, mallet_path=mallet)

       >>> M.prep()                                       # 5. Prep model.
       >>> model = M.build(Z=50, max_iter=300)            # 6. Build model.
       >>> model                                          # (May take a while.)
       <tethne.model.corpus.ldamodel.LDAModel at 0x10bfac710>

    A plot showing the log-likelihood/topic over modeling iterations should
    be generated in your `outpath`. For example:

    .. figure:: _static/images/ldamodel_LL.png
       :width: 400
       :align: center

    Behind the scenes, the :func:`.prep` procedure generates a plain-text
    corpus file at `temppath`, along with a metadata file. MALLET's
    ``import-file`` procedure is then called, which translates the corpus
    into MALLET's internal format (also stored at the `temppath`).

    The :func:`.build` procedure then invokes MALLET's ``train-topics``
    procedure. This step may take a considerable amount of time, anywhere
    from a few minutes (small corpus, few topics) to a few hours (large
    corpus, many topics).

    For a :class:`.Corpus` with a few thousand :class:`.Paper`\s, 300-500
    iterations are often sufficient to achieve convergence for 20-100 topics.

    Once the :class:`.LDAModel` is built, you can access its methods
    directly. See full method descriptions in :class:`.LDAModel`\.

    For more information about topic modeling with MALLET see `this tutorial
    <http://programminghistorian.org/lessons/topic-modeling-and-mallet>`_.
    """
    mallet_path = MALLET_PATH

    def __init__(self, *args, **kwargs):
        self.mallet_bin = os.path.join(self.mallet_path, "bin", "mallet")
        if platform.system() == 'Windows':
            self.mallet_bin += '.bat'
        os.putenv('MALLET_HOME', self.mallet_path)

        super(LDAModel, self).__init__(*args, **kwargs)
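
    # Instantiation sketch (a hedged example: assumes the base ``Model``
    # constructor accepts the corpus plus keyword arguments such as
    # ``featureset_name``; the featureset name is illustrative):
    #
    #     model = LDAModel(C, featureset_name='wc_filtered')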

    def prep(self):
        self.dt = os.path.join(self.temp, "dt.dat")
        self.wt = os.path.join(self.temp, "wt.dat")
        self.om = os.path.join(self.temp, "model.mallet")

        self._generate_corpus()

    def _generate_corpus(self):
        """
        Writes a corpus to disk amenable to MALLET topic modeling.
        """
        target = self.temp + 'mallet'
        paths = write_documents(self.corpus, target, self.featureset_name,
                                ['date', 'title'])
        self.corpus_path, self.metapath = paths

        self._export_corpus()

    def _export_corpus(self):
        """
        Calls MALLET's `import-file` method.
        """
        # bin/mallet import-file --input /Users/erickpeirson/mycorpus_docs.txt
        #     --output mytopic-input.mallet --keep-sequence --remove-stopwords
        if not os.path.exists(self.mallet_bin):
            raise IOError("MALLET path invalid or non-existent.")

        self.input_path = os.path.join(self.temp, "input.mallet")

        exit = subprocess.call([
            self.mallet_bin,
            'import-file',
            '--input', self.corpus_path,
            '--output', self.input_path,
            '--keep-sequence',          # Required for LDA.
            '--remove-stopwords'])      # Probably redundant.

        if exit != 0:
            msg = "MALLET import-file failed with exit code {0}.".format(exit)
            raise RuntimeError(msg)

    def run(self, **kwargs):
        """
        Calls MALLET's ``train-topics`` procedure.
        """
        # $ bin/mallet train-topics --input mytopic-input.mallet
        #     --num-topics 100
        #     --output-doc-topics /Users/erickpeirson/doc_top
        #     --word-topic-counts-file /Users/erickpeirson/word_top
        #     --output-topic-keys /Users/erickpeirson/topic_keys
        if not os.path.exists(self.mallet_bin):
            raise IOError("MALLET path invalid or non-existent.")

        for attr in ['Z', 'max_iter']:
            if not hasattr(self, attr):
                raise AttributeError('Please set {0}'.format(attr))

        self.ll = []
        self.num_iters = 0
        logger.debug('run() with k={0} for {1} iterations'.format(
            self.Z, self.max_iter))

        prog = re.compile(r'\<([^\)]+)\>')
        ll_prog = re.compile(r'(\d+)')
        p = subprocess.Popen([
            self.mallet_bin,
            'train-topics',
            '--input', self.input_path,
            '--num-topics', unicode(self.Z),
            '--num-iterations', unicode(self.max_iter),
            '--output-doc-topics', self.dt,
            '--word-topic-counts-file', self.wt,
            '--output-model', self.om],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        # Handle output of MALLET in real time.
        while p.poll() is None:
            l = p.stderr.readline()
            if PYTHON_3:                # Popen streams yield bytes in Py3.
                l = l.decode('utf-8')

            # Keep track of LL/topic.
            try:
                this_ll = float(re.findall(r'([-+]\d+\.\d+)', l)[0])
                self.ll.append(this_ll)
            except IndexError:          # Not every line will match.
                pass

            # Keep track of modeling progress.
            try:
                this_iter = float(prog.match(l).groups()[0])
                progress = int(100. * this_iter / self.max_iter)
                sys.stdout.write('Modeling progress: {0}%.\r'.format(progress))
                sys.stdout.flush()
            except AttributeError:      # Not every line will match.
                pass

        self.num_iters += self.max_iter

        self.load()
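
    # Usage sketch for ``run`` (values are illustrative; ``Z`` and
    # ``max_iter`` must be set first, as the attribute check above enforces):
    #
    #     model.prep()
    #     model.Z = 50            # Number of topics.
    #     model.max_iter = 300    # Sampling iterations.
    #     model.run()             # Calls load() on completion.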

    def load(self, **kwargs):
        self._read_theta(kwargs.get('dt', self.dt))
        self._read_phi(kwargs.get('wt', self.wt))

    def _read_theta(self, dt):
        """
        Used by :func:`.from_mallet` to reconstruct theta posterior
        distributions.

        Returns
        -------
        theta : :class:`.FeatureSet`
            Rows are documents, columns are topics. Rows sum to ~1.
        """
        self.theta = FeatureSet()

        with open(dt, "r") as f:    # Text mode, for csv under Python 3.
            reader = csv.reader(f, delimiter='\t')
            for i, line in enumerate(reader):
                if i == 0:
                    continue        # Skip the header row.
                d, id, t = int(line[0]), unicode(line[1]), line[2:]

                # The remaining fields alternate topic, proportion.
                feature = Feature([(int(t[j]), float(t[j + 1]))
                                   for j in xrange(0, len(t) - 1, 2)])
                self.theta.add(id, feature)

        self.corpus.features['topics'] = self.theta
        return self.theta

    def _read_phi(self, wt):
        """
        Used by :func:`.from_mallet` to reconstruct phi posterior
        distributions.

        Sets ``self.phi``, a :class:`.FeatureSet` in which rows are topics
        and columns are words. Rows sum to ~1.
        """
        self.vocabulary = {}
        phi_features = {}

        # TODO: make this encoding-safe.
        with open(wt, "r") as f:
            reader = csv.reader(f, delimiter=' ')

            topics = defaultdict(list)
            for line in reader:
                w, term = int(line[0]), unicode(line[1])
                self.vocabulary[w] = term
                for l in line[2:]:
                    k, c = l.split(':')     # Topic and assignment count.
                    topics[int(k)].append((w, int(c)))

        for k, data in topics.items():      # iteritems() is Python 2 only.
            phi_features[k] = Feature(data).norm
        self.phi = FeatureSet(phi_features)
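
    # The parsers above assume the output layout of the bundled MALLET 2.0.7.
    # After a header row, each line of dt.dat is tab-separated:
    #
    #     <doc index>  <doc id>  <topic>  <proportion>  <topic>  <proportion> ...
    #
    # and each line of wt.dat is space-separated:
    #
    #     <word index>  <term>  <topic>:<count>  <topic>:<count> ...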

    def topics_in(self, d, topn=5):
        """
        List the top ``topn`` topics in document ``d``.
        """
        return self.theta.features[d].top(topn)

    def list_topic(self, k, Nwords=10):
        """
        List the top ``Nwords`` words for topic ``k``.

        Examples
        --------

        .. code-block:: python

           >>> model.list_topic(1, Nwords=5)
           [ 'opposed', 'terminates', 'trichinosis', 'cistus', 'acaule' ]

        """
        return [(self.vocabulary[w], p) for w, p
                in self.phi.features[k].top(Nwords)]

    def list_topics(self, Nwords=10):
        """
        List the top ``Nwords`` words for each topic.
        """
        return [(k, self.list_topic(k, Nwords))
                for k in xrange(len(self.phi))]

    def print_topics(self, Nwords=10):
        """
        Print the top ``Nwords`` words for each topic.
        """
        print('Topic\tTop %i words' % Nwords)
        for k, words in self.list_topics(Nwords):
            print(unicode(k).ljust(3) + '\t' + ' '.join(list(zip(*words))[0]))

    def topic_over_time(self, k, mode='counts', slice_kwargs={}):
        """
        Calculate the representation of topic ``k`` in the corpus over time.
        """
        return self.corpus.feature_distribution('topics', k, mode=mode,
                                                **slice_kwargs)
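

if __name__ == '__main__':
    # Minimal end-to-end sketch following the workflow from the class
    # docstring. Paths are illustrative, and the constructor is assumed to
    # accept the corpus plus a ``featureset_name`` keyword argument.
    from nltk.corpus import stopwords
    from tethne.readers import dfr

    stoplist = stopwords.words()
    C = dfr.corpus_from_dir('/path/to/DfR/datasets', 'uni', stoplist)
    C.filter_features('wordcounts', 'wc_filtered',
                      lambda s, c, dc: c > 3 and dc > 1 and len(s) > 3)

    model = LDAModel(C, featureset_name='wc_filtered')
    model.prep()
    model.Z, model.max_iter = 50, 300
    model.run()                     # May take a while; calls load() when done.
    model.print_topics(Nwords=10)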