Need help? Have a feature request? Please check out the
tethne-users group
.
Source code for tethne.tests.test_classes_feature
import sys
sys.path.append('./')
import unittest
from tethne.classes.feature import Feature, FeatureSet
import logging
logger = logging.getLogger('feature')
logger.setLevel('ERROR')
[docs]class TestFeature(unittest.TestCase):
[docs] def test_init_datum(self):
"""
Initialize with a single token.
"""
feature = Feature('bob')
self.assertEqual(len(feature), 1)
self.assertEqual(feature[0], ('bob', 1))
[docs] def test_init_list(self):
"""
Initialize with a list of tokens.
"""
feature = Feature(['bob', 'joe', 'bob', 'bobert', 'bob'])
self.assertEqual(len(feature), 3)
self.assertEqual(dict(feature)['bob'], 3)
self.assertEqual(dict(feature)['joe'], 1)
[docs] def test_init_counts(self):
"""
Initialize with a list of 2-tuple token values.
"""
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
self.assertEqual(len(feature), 3)
self.assertEqual(dict(feature)['bob'], 3)
self.assertEqual(dict(feature)['joe'], 1)
[docs] def test_init_tuples(self):
feature = Feature([('bob', 'dole'), ('roy', 'snaydon')])
self.assertEqual(len(feature), 2)
self.assertEqual(dict(feature)[('bob', 'dole')], 1)
[docs] def test_norm(self):
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
T = sum(list(zip(*feature))[1])
for n, r in zip(list(zip(*feature.norm))[1], list(zip(*feature))[1]):
self.assertEqual(n, float(r)/T)
[docs] def test_extend(self):
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature.extend([('bob', 1)])
self.assertEqual(feature.value('bob'), 4)
feature.extend(['bob'])
self.assertEqual(feature.value('bob'), 5)
feature.extend('bob')
self.assertEqual(feature.value('bob'), 6)
[docs] def test_iadd(self):
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature += [('bob', 1)]
self.assertEqual(feature.value('bob'), 4)
feature += ['bob']
self.assertEqual(feature.value('bob'), 5)
feature += 'bob'
self.assertEqual(feature.value('bob'), 6)
[docs] def test_isub(self):
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature -= [('bob', 1)]
self.assertEqual(feature.value('bob'), 2)
feature -= ['bob']
self.assertEqual(feature.value('bob'), 1)
feature -= 'bob'
self.assertEqual(feature.value('bob'), 0)
[docs]class TestFeatureSetWithData(unittest.TestCase):
[docs] def test_featureset(self):
f = Feature([(1585, 0.00054845065429964715), (1262, 0.00054605985306858213), (444, 0.00053942261617068057), (5106, 0.00053648963322009118), (206, 0.00053379098026327346), (1341, 0.00053329960378783244), (353, 0.00053110237444066769), (1498, 0.00052695505145953733), (1, 0.00052553534496093041)])
f2 = Feature([(1585, 0.00054845065429964715), (1262, 0.00054605985306858213), (444, 0.00053942261617068057), (5106, 0.00053648963322009118), (206, 0.00053379098026327346), (1341, 0.00053329960378783244), (353, 0.00053110237444066769), (1498, 0.00052695505145953733), (1, 0.00052553534496093041)])
fset = FeatureSet({'f': f, 'f2': f2})
top = fset.top(5)
self.assertIsInstance(top, list)
self.assertIsInstance(top[0], tuple)
self.assertEqual(len(top), 5)
print fset['f'].top(5)
[docs]class TestFeatureSet(unittest.TestCase):
[docs] def test_end_to_end_raw(self):
"""
Runs the Gensim LDA workflow
(https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation).
"""
from tethne.readers.wos import read
corpus = read('./tethne/tests/data/wos3.txt')
from nltk.tokenize import word_tokenize
corpus.index_feature('abstract', word_tokenize)
gensim_corpus, _ = corpus.features['abstract'].to_gensim_corpus(raw=True)
from gensim import corpora, models
dictionary = corpora.Dictionary(gensim_corpus)
corpus = [dictionary.doc2bow(text) for text in gensim_corpus]
model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
num_topics=5, update_every=1,
chunksize=100, passes=1)
model.print_topics()
[docs] def test_end_to_end(self):
"""
Runs the Gensim LDA workflow
(https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation).
"""
from tethne.readers.wos import read
corpus = read('./tethne/tests/data/wos3.txt')
from nltk.tokenize import word_tokenize
corpus.index_feature('abstract', word_tokenize)
gensim_corpus, id2word = corpus.features['abstract'].to_gensim_corpus()
from gensim import models
model = models.ldamodel.LdaModel(corpus=gensim_corpus, id2word=id2word,
num_topics=5, update_every=1,
chunksize=100, passes=1)
model.print_topics()
[docs] def test_init_empty(self):
"""
Initialize with no Features.
"""
logger.debug('FeatureSet should have 0 Features')
try:
featureset = FeatureSet()
featureset.__init__()
except:
self.fail()
[docs] def test_empty_feature(self):
feature1 = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature2 = Feature([])
try:
featureset = FeatureSet({'p1': feature1, 'p2': feature2})
except Exception as E:
self.fail(E.message)
[docs] def test_init_features(self):
"""
Initialize with multiple features.
"""
logger.debug('FeatureSet should have 2 Features')
feature1 = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature2 = Feature([('bob', 3), ('jane', 1), ('fido', 1)])
featureset = FeatureSet({'p1': feature1, 'p2': feature2})
self.assertEqual(len(featureset.features), 2)
expected = len(feature1.unique | feature2.unique)
self.assertEqual(len(featureset.index), expected)
self.assertEqual(len(featureset.lookup), expected)
self.assertEqual(len(featureset.counts), expected)
self.assertEqual(len(featureset.documentCounts), expected)
self.assertEqual(len(featureset.unique), expected)
self.assertEqual(featureset.documentCount('bob'), 2)
self.assertEqual(featureset.count('bob'), 6)
self.assertIn('p1', featureset.papers_containing('bob'))
self.assertIn('p2', featureset.papers_containing('bob'))
[docs] def test_transform(self):
"""
"""
feature1 = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature2 = Feature([('bob', 3), ('jane', 1), ('fido', 1)])
featureset = FeatureSet({'p1': feature1, 'p2': feature2})
featureset_transformed = featureset.transform(lambda f, c, C, DC: c*3)
self.assertEqual(len(featureset_transformed.features), 2)
expected = len(featureset_transformed.unique | feature2.unique)
self.assertEqual(len(featureset_transformed.index), expected)
self.assertEqual(len(featureset_transformed.lookup), expected)
self.assertEqual(len(featureset_transformed.counts), expected)
self.assertEqual(len(featureset_transformed.documentCounts), expected)
self.assertEqual(len(featureset_transformed.unique), expected)
self.assertEqual(featureset_transformed.documentCount('bob'), 2)
self.assertEqual(featureset_transformed.count('bob'), 18)
self.assertIn('p1', featureset_transformed.papers_containing('bob'))
self.assertIn('p2', featureset_transformed.papers_containing('bob'))
[docs] def test_add_feature(self):
"""
Initialize empty, then add a feature.
"""
featureset = FeatureSet()
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
featureset.add('p1', feature)
self.assertEqual(len(featureset.features), 1)
expected = len(feature.unique)
self.assertEqual(len(featureset.index), expected)
self.assertEqual(len(featureset.lookup), expected)
self.assertEqual(len(featureset.counts), expected)
self.assertEqual(len(featureset.documentCounts), expected)
self.assertEqual(len(featureset.unique), expected)
self.assertIn('p1', featureset.papers_containing('bob'))
self.assertEqual(featureset.documentCount('bob'), 1)
self.assertEqual(featureset.count('bob'), 3)
# Do it again! There was some weirdness with the FeatureSet constructor.
featureset = FeatureSet()
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
featureset.add('p1', feature)
self.assertEqual(len(featureset.features), 1)
expected = len(feature.unique)
self.assertEqual(len(featureset.index), expected)
self.assertEqual(len(featureset.lookup), expected)
self.assertEqual(len(featureset.counts), expected)
self.assertEqual(len(featureset.documentCounts), expected)
self.assertEqual(len(featureset.unique), expected)
self.assertIn('p1', featureset.papers_containing('bob'))
self.assertEqual(featureset.documentCount('bob'), 1)
self.assertEqual(featureset.count('bob'), 3)
[docs] def test_top(self):
featureset = FeatureSet()
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature2 = Feature([('blob', 3), ('joe', 1), ('brobert', 1)])
feature3 = Feature([('blob', 1), ('joe', 1), ('brobert', 1)])
featureset.add('p1', feature)
featureset.add('p2', feature2)
featureset.add('p3', feature3)
N = 3
top = featureset.top(N)
self.assertIsInstance(top, list)
self.assertIsInstance(top[0], tuple)
self.assertEqual(len(top), N)
self.assertSetEqual(set(list(zip(*top))[0]), set(['blob', 'bob', 'joe']))
top = featureset.top(N, by='documentCounts')
self.assertIsInstance(top, list)
self.assertIsInstance(top[0], tuple)
self.assertEqual(len(top), N)
self.assertSetEqual(set(list(zip(*top))[0]), set(['blob', 'brobert', 'joe']))
[docs] def test_as_matrix(self):
featureset = FeatureSet()
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature2 = Feature([('blob', 3), ('joe', 1), ('brobert', 1)])
feature3 = Feature([('blob', 1), ('joe', 1), ('brobert', 1)])
featureset.add('p1', feature)
featureset.add('p2', feature2)
featureset.add('p3', feature3)
M = featureset.as_matrix()
self.assertEqual(len(M), len(featureset))
self.assertEqual(len(M[0]), len(featureset.unique))
[docs] def test_as_vector(self):
featureset = FeatureSet()
feature = Feature([('bob', 3), ('joe', 1), ('bobert', 1)])
feature2 = Feature([('blob', 3), ('joe', 1), ('brobert', 1)])
feature3 = Feature([('blob', 1), ('joe', 1), ('brobert', 1)])
featureset.add('p1', feature)
featureset.add('p2', feature2)
featureset.add('p3', feature3)
v = featureset.as_vector('p1')
v_norm = featureset.as_vector('p1', norm=True)
self.assertIsInstance(v, list)
self.assertIsInstance(v_norm, list)
self.assertEqual(len(v), len(v_norm))
self.assertEqual(len(v), len(featureset.unique))
self.assertGreater(sum(v), 0)
self.assertGreater(sum(v_norm), 0)
self.assertEqual(sum(v_norm), 1.0)
if __name__ == '__main__':
unittest.main()