SciPy

Source code for tethne.managers.corpusmanager

from collectionmanager import CollectionManager

from ..classes import Corpus

class CorpusManager(CollectionManager):
    """
    Common parent for managers that work with a Corpus.

    The only state added on top of :class:`CollectionManager` is the
    location of the data, stored on the instance as ``datapath``.
    """

    def __init__(self, datapath, **kwargs):
        """
        Parameters
        ----------
        datapath : str
            Path to data.
        **kwargs
            Passed through unchanged to ``CollectionManager.__init__``.
        """
        # Parent initialisation runs first; ``datapath`` is attached only
        # after it returns.
        super(CorpusManager, self).__init__(**kwargs)

        self.datapath = datapath
class SampleDFRManager(CorpusManager):
    """
    Example :class:`.CorpusManager` for JSTOR Data-for-Research datasets.

    The class-level settings below drive :meth:`prep`, :meth:`build` and
    :meth:`write`; override them in a subclass to change the behaviour.
    """

    # First slicing axis, plus the options handed to ``Corpus.slice`` for it.
    slice_axis = 'date'
    slice_method = 'time_window'
    window_size = 4
    step_size = 1

    # Second slicing axis; sliced with no extra options.
    slice_axis2 = 'jtitle'

    # Second positional argument given to ``dfr.ngrams``.
    gram_type = 'uni'

    def prep(self):
        """
        Load papers and n-gram features from ``self.datapath``.

        Sets ``self.papers`` to the result of ``dfr.read`` and
        ``self.features`` to a dict with the single key ``'unigrams'``
        holding the result of ``dfr.ngrams``.
        """
        # Imported lazily, as in the rest of this class's methods.
        from ..readers import dfr

        source = self.datapath
        self.papers = dfr.read(source)

        grams = dfr.ngrams(source, self.gram_type)
        self.features = {'unigrams': grams}

    def build(self):
        """
        Create the :class:`Corpus` as ``self.D`` and slice it on both axes.

        Expects ``self.papers`` and ``self.features`` to be set already
        (:meth:`prep` does this).
        """
        from nltk.corpus import stopwords

        # Everything returned by ``stopwords.words()`` is excluded.
        stoplist = set(stopwords.words())

        corpus = Corpus(
            self.papers,
            self.features,
            index_by='doi',
            exclude=stoplist,
        )
        # Store the corpus before slicing so it is available on the instance
        # even if a slice call fails.
        self.D = corpus

        primary_opts = {
            'method': self.slice_method,
            'window_size': self.window_size,
            'step_size': self.step_size,
        }
        corpus.slice(self.slice_axis, **primary_opts)
        corpus.slice(self.slice_axis2)

    def write(self, target='./distribution.png'):
        """
        Plot the corpus distribution over the two slice axes and save it.

        Parameters
        ----------
        target : str
            Destination handed to ``matplotlib.pyplot.savefig``.
        """
        import matplotlib.pyplot as plt

        canvas = plt.figure(figsize=(25, 10))
        self.D.plot_distribution(
            self.slice_axis2,
            self.slice_axis,
            fig=canvas,
            interpolation='none',
        )
        # Saves pyplot's *current* figure, exactly as the original did.
        plt.savefig(target)