
Source code for tethne.tests.test_classes_structuredfeature

import sys
sys.path.append('./')

from collections import Counter

import unittest

from tethne.classes.feature import StructuredFeature, StructuredFeatureSet


class TestStructuredFeatureSetToGensim(unittest.TestCase):
    def test_to_gensim_corpus(self):
        tokens1 = [chr(i) for i in range(65, 250)]
        tokens2 = [chr(i) for i in range(65, 250)][::-1]
        contexts1 = [('sentence', [0, 25, 57, 89, 124, 156, 172]),
                     ('paragraph', [0, 89, 172])]
        contexts2 = [('paragraph', [0, 101])]
        feature1 = StructuredFeature(tokens1, contexts1)
        feature2 = StructuredFeature(tokens2, contexts2)
        features = {
            'first': feature1,
            'second': feature2,
        }
        fset = StructuredFeatureSet(features)

        gensim_corpus, _ = fset.to_gensim_corpus('paragraph', raw=True)
        self.assertIsInstance(gensim_corpus, list)
        self.assertEqual(len(gensim_corpus), 5)
        self.assertIsInstance(gensim_corpus[0], list)
        self.assertIsInstance(gensim_corpus[0][0], str)
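    # Note (added; not in the original test): feature1 defines three
    # paragraphs and feature2 defines two, so chunking on 'paragraph' is
    # expected to yield the five token lists asserted above; raw=True is
    # assumed to return the tokens themselves rather than integer ids.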
    def test_end_to_end_raw(self):
        """
        Runs the Gensim LDA workflow
        (https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation).
        """
        from tethne.readers.wos import read
        corpus = read('./tethne/tests/data/wos3.txt')

        from nltk.tokenize import word_tokenize
        corpus.index_feature('abstract', word_tokenize, structured=True)

        gensim_corpus, _ = corpus.features['abstract'].to_gensim_corpus(raw=True)

        from gensim import corpora, models
        dictionary = corpora.Dictionary(gensim_corpus)
        # Bind the bag-of-words corpus to a new name, rather than shadowing
        # the tethne Corpus assigned to ``corpus`` above.
        bow_corpus = [dictionary.doc2bow(text) for text in gensim_corpus]
        model = models.ldamodel.LdaModel(corpus=bow_corpus, id2word=dictionary,
                                         num_topics=5, update_every=1,
                                         chunksize=100, passes=1)
        model.print_topics()
    def test_end_to_end(self):
        """
        Runs the Gensim LDA workflow
        (https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation).
        """
        from tethne.readers.wos import read
        corpus = read('./tethne/tests/data/wos3.txt')

        from nltk.tokenize import word_tokenize
        corpus.index_feature('abstract', word_tokenize, structured=True)

        gensim_corpus, id2word = corpus.features['abstract'].to_gensim_corpus()

        from gensim import models
        model = models.ldamodel.LdaModel(corpus=gensim_corpus, id2word=id2word,
                                         num_topics=5, update_every=1,
                                         chunksize=100, passes=1)
        model.print_topics()
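# A hedged sketch (not part of the original module): the two end-to-end tests
# above differ only in where the token -> integer-id mapping is built. With
# raw=True the mapping is left to Gensim, as this illustrative helper shows;
# without it, to_gensim_corpus returns bag-of-words vectors together with a
# ready-made id2word mapping.
def _lda_from_raw_tokens(token_lists, num_topics=5):
    """Illustrative helper: fit a Gensim LDA model from raw token lists."""
    from gensim import corpora, models
    dictionary = corpora.Dictionary(token_lists)          # token -> integer id
    bow_corpus = [dictionary.doc2bow(text) for text in token_lists]
    return models.ldamodel.LdaModel(corpus=bow_corpus, id2word=dictionary,
                                    num_topics=num_topics, passes=1)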
class TestStructuredFeatureSet(unittest.TestCase):
    def setUp(self):
        self.tokens1 = list(range(0, 205))
        self.tokens2 = list(range(0, 42))
        contexts1 = [('sentence', [0, 25, 57, 89, 124, 156, 172, 191]),
                     ('paragraph', [0, 89, 172])]
        contexts2 = [('paragraph', [0, 29])]
        self.feature1 = StructuredFeature(self.tokens1, contexts1)
        self.feature2 = StructuredFeature(self.tokens2, contexts2)
    def test_init(self):
        features = {
            'first': self.feature1,
            'second': self.feature2,
        }
        fset = StructuredFeatureSet(features)

        self.assertEqual(len(fset), len(features))
        self.assertIsInstance(fset.unique, set)

        N_features_expected = max(len(self.tokens1), len(self.tokens2))
        self.assertEqual(fset.N_features, N_features_expected)
        self.assertEqual(fset.N_documents, len(features))

        expected_count = Counter(self.tokens1 + self.tokens2)[0]
        self.assertEqual(fset.count(0), expected_count)
        self.assertEqual(len(fset.papers_containing(0)), len(features))
    def test_transform(self):
        features = {
            'first': self.feature1,
            'second': self.feature2,
        }
        fset = StructuredFeatureSet(features)

        # Keep even tokens unchanged; returning False drops a token.
        xf = lambda f, c, fc, dc: f if f % 2 == 0 else False
        fset2 = fset.transform(xf)
        self.assertGreater(len(fset.index), len(fset2.index))
    def test_select_context(self):
        features = {
            'first': self.feature1,
            'second': self.feature2,
        }
        fset = StructuredFeatureSet(features)

        N_paragraphs = len(fset.features['first'].contexts['paragraph']) + \
                       len(fset.features['second'].contexts['paragraph'])
        papers, chunks = fset.context_chunks('paragraph')
        self.assertEqual(len(chunks), N_paragraphs)
        self.assertEqual(len(papers), len(features))
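# A hedged sketch (not part of the original module): throughout these tests,
# a context is a (name, offsets) pair whose offsets are the *start* indices
# of each chunk. Partitioning a token list by such offsets can be written as:
def _chunks_from_offsets(tokens, offsets):
    """Illustrative helper: split ``tokens`` at the given start offsets."""
    bounds = list(offsets) + [len(tokens)]    # the final chunk runs to the end
    return [tokens[start:end] for start, end in zip(bounds, bounds[1:])]

# For example, _chunks_from_offsets(list(range(10)), [0, 4, 7]) returns
# [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]] -- one list per chunk, which is the
# shape that context_chunks() and __getitem__(context) are asserted to have.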
class TestStructuredFeature(unittest.TestCase):
    def setUp(self):
        self.testTokens = list(range(0, 50))
        self.testSentence = ('sentence', [0, 20, 27, 34])
        self.testPara = ('paragraph', [0, 27])
    def test_init(self):
        contexts = [self.testPara, self.testSentence]
        sfeature = StructuredFeature(self.testTokens, contexts)
        self.assertEqual(len(sfeature), len(self.testTokens))
        self.assertEqual(len(sfeature.contexts), len(contexts))
    def test_select(self):
        name, sentences = self.testSentence
        contexts = [self.testPara, self.testSentence]
        sfeature = StructuredFeature(self.testTokens, contexts)

        selected_sentences = sfeature['sentence']
        self.assertIsInstance(selected_sentences, list)
        self.assertEqual(len(selected_sentences), len(sentences),
                         "__getitem__(context) should return all tokens,"
                         " separated into context chunks.")
    def test_select_chunk(self):
        name, sentences = self.testSentence
        sentence_size = sentences[1] - sentences[0]
        contexts = [self.testPara, self.testSentence]
        sfeature = StructuredFeature(self.testTokens, contexts)

        selected_sentence = sfeature[('sentence', 0)]
        self.assertIsInstance(selected_sentence, list)
        self.assertEqual(len(selected_sentence), sentence_size,
                         "__getitem__((context, chunk)) should return the"
                         " tokens in that chunk.")
    def test_add_context(self):
        name = 'orthogonal'
        indices = [0, 5, 22, 38]
        newContext = (name, indices)
        contexts = [self.testPara, self.testSentence]
        sfeature = StructuredFeature(self.testTokens, contexts)

        N_contexts_prior = len(sfeature.contexts)
        sfeature.add_context(*newContext)
        N_contexts_post = len(sfeature.contexts)
        self.assertGreater(N_contexts_post, N_contexts_prior)

        selected_sentences = sfeature['orthogonal']
        self.assertEqual(len(selected_sentences), len(indices))

        # Chunk 1 spans from offset 5 up to (but not including) offset 22.
        selected_sentence = sfeature['orthogonal', 1]
        self.assertEqual(len(selected_sentence), indices[2] - indices[1])
if __name__ == '__main__':
    unittest.main()