import sys
sys.path.append('../tethne')
import unittest
from tethne.readers import merge
from tethne.readers.dfr import read, ngrams, _handle_author,_dfr2paper_map,_create_ayjid,_handle_pagerange,tokenize,_handle_authors,_handle_paper
from tethne import Corpus, Paper, FeatureSet
import xml.etree.ElementTree as ET
# Fixture locations. The paths are relative, so they resolve against the
# directory the tests are launched from.
# DfR dataset directory passed to ``read`` and to ``ngrams`` ('wordcounts').
datapath = './tethne/tests/data/dfr'
# DfR dataset directory passed to ``ngrams`` for the 'keyterms' feature, used
# by the floating-point weights test.
datapath_float_weights = './tethne/tests/data/dfr_float_weights'
# XML file parsed in TestHandlePaper and searched for ``article`` elements.
sample_datapath = './tethne/tests/data/test_citations_sample.xml'
class TestDFRReaderStreaming(unittest.TestCase):
    """Check the DfR reader when it is invoked with ``streaming=True``."""

    def test_read(self):
        """Read the DfR fixture and type-check every optional paper field.

        ``read`` must return a ``Corpus``. Each entry yielded by iterating
        the corpus is inspected: any of the known fields that is present
        must have the expected type, and the first two items of every
        ``authors_init`` entry must be uppercase.
        """
        # Failure message for the uppercase checks. The original code passed
        # an undefined name here, which raised NameError instead of asserting.
        upper_msg = 'Entries in authors_init should be uppercase'

        # Optional fields, in the order they are checked, with the type each
        # one must have whenever it is present.
        field_types = (
            ('date', int),
            ('authors_init', list),
            ('journal', str),
            ('abstract', str),
            ('authorKeywords', list),
            ('keywordsPlus', list),
            ('doi', str),
            ('volume', str),
            ('title', str),
        )

        corpus = read(datapath, streaming=True)
        self.assertIsInstance(corpus, Corpus)

        for paper in corpus:
            for field, expected_type in field_types:
                if not hasattr(paper, field):
                    continue
                value = getattr(paper, field)
                self.assertIsInstance(value, expected_type)
                if field == 'authors_init':
                    for author in value:
                        self.assertTrue(author[0].isupper(), upper_msg)
                        self.assertTrue(author[1].isupper(), upper_msg)
class TestDFRReader(unittest.TestCase):
    """Check the DfR reader in its default (non-streaming) mode."""

    def test_read(self):
        """Read the DfR fixture and validate its papers and features.

        ``read`` must return a ``Corpus``. Every paper in ``corpus.papers``
        is inspected: any of the known fields that is present must have the
        expected type, and the first two items of every ``authors_init``
        entry must be uppercase. The corpus must also expose a
        'wordcounts' feature set that is no larger than the corpus itself.
        """
        # Failure message for the uppercase checks. The original code passed
        # an undefined name here, which raised NameError instead of asserting.
        upper_msg = 'Entries in authors_init should be uppercase'

        # Optional fields, in the order they are checked, with the type each
        # one must have whenever it is present.
        field_types = (
            ('date', int),
            ('authors_init', list),
            ('journal', str),
            ('abstract', str),
            ('authorKeywords', list),
            ('keywordsPlus', list),
            ('doi', str),
            ('volume', str),
            ('title', str),
        )

        corpus = read(datapath)
        self.assertIsInstance(corpus, Corpus)

        for paper in corpus.papers:
            for field, expected_type in field_types:
                if not hasattr(paper, field):
                    continue
                value = getattr(paper, field)
                self.assertIsInstance(value, expected_type)
                if field == 'authors_init':
                    for author in value:
                        self.assertTrue(author[0].isupper(), upper_msg)
                        self.assertTrue(author[1].isupper(), upper_msg)

        self.assertIn('wordcounts', corpus.features)
        self.assertGreaterEqual(len(corpus),
                                len(corpus.features['wordcounts']))
class TestNGrams(unittest.TestCase):
    """Exercise ``ngrams`` against the DfR fixture directories."""

    def test_ngrams(self):
        """The 'wordcounts' feature loads as a FeatureSet of the known size."""
        featureset = ngrams(datapath, 'wordcounts')

        self.assertIsInstance(featureset, FeatureSet)
        self.assertEqual(len(featureset), 398)
        self.assertEqual(len(featureset.index), 105156)

    def test_float_weights(self):
        """
        DfR feature weights are not always ints; some are floating-point.
        The 'keyterms' fixture covers that case.
        """
        featureset = ngrams(datapath_float_weights, 'keyterms')

        self.assertIsInstance(featureset, FeatureSet)
        self.assertEqual(len(featureset), 2)
        self.assertEqual(len(featureset.index), 43)
class TestCitationFile(unittest.TestCase):
    """Check that ``read`` handles the ``dfr2`` fixture directory."""

    def test_citations_file(self):
        """Reading the ``dfr2`` dataset must produce a ``Corpus``."""
        dfr2_path = './tethne/tests/data/dfr2'
        corpus = read(dfr2_path)
        self.assertIsInstance(corpus, Corpus)
class TestHandleAuthor(unittest.TestCase):
    """Check the pair that ``_handle_author`` returns for one name."""

    def test_handle_author_NOJR(self):
        """A name without a 'Jr.' suffix."""
        parsed = _handle_author("Edward E. Terrell")
        self.assertEqual(('TERRELL', 'E'), parsed)

    def test_handle_author_JR(self):
        """A name carrying a ', Jr.' suffix."""
        parsed = _handle_author("G. Ledyard Stebbins, Jr.")
        self.assertEqual(('STEBBINS, JR', 'G'), parsed)
class TestDfr2PaperMap(unittest.TestCase):
    """Pin the mapping returned by ``_dfr2paper_map``."""

    def test_dfr2paper(self):
        """The returned dict must equal the expected key/value mapping."""
        expected_map = {
            'doi': 'doi',
            'title': 'atitle',
            'journaltitle': 'jtitle',
            'volume': 'volume',
            'issue': 'issue',
        }
        self.assertEqual(expected_map, _dfr2paper_map())
class TestCreateAyijid(unittest.TestCase):
    """Check ``_create_ayjid`` when some or all arguments are ``None``."""

    def test_no_aulast(self):
        """Only the second argument (``['R']``) is supplied."""
        result = _create_ayjid(None, ['R'], None, None)
        self.assertEqual(' R ', result)

    def test_no_auinit(self):
        """Only the first argument (``['NIXON']``) is supplied."""
        result = _create_ayjid(['NIXON'], None, None, None)
        self.assertEqual('NIXON ', result)

    def test_all_None_args(self):
        """With every argument ``None`` the placeholder string is returned."""
        result = _create_ayjid(None, None, None, None)
        self.assertEqual('UNKNOWN PAPER', result)
class TestHandlePageRange(unittest.TestCase):
    """Check the (start, end) pair produced by ``_handle_pagerange``."""

    def test_handle_pagerange_noNumbers(self):
        """A page range string without digits yields ('0', '0')."""
        raw = 'pp.efcadd'
        expected = (u'0', u'0')
        actual = _handle_pagerange(raw)
        self.assertEqual(expected, actual)

    def test_handle_pagerange(self):
        """A 'pp. start-end' string yields the two page numbers as text."""
        raw = 'pp. 111-999'
        expected = (u'111', u'999')
        actual = _handle_pagerange(raw)
        self.assertEqual(expected, actual)
class TestHandleAuthors(unittest.TestCase):
    """Check ``_handle_authors`` for list input and for string input."""

    def test_handle_authors_list(self):
        """Testing the functionality when the input parameter is a list."""
        names = ['Cistein Stromnaes', 'E. D. Garber']
        expected_last = ['STROMNAES', 'GARBER']
        expected_init = ['C', 'E']

        # Item 0 of the result is compared with the last names, item 1 with
        # the initials.
        self.assertEqual(expected_last, _handle_authors(names)[0])
        self.assertEqual(expected_init, _handle_authors(names)[1])

    def test_handle_authors_String(self):
        """Testing the functionality when the input parameter is a string."""
        name = 'S. H. Yarnell'
        expected_last = ['YARNELL']
        expected_init = ['S']

        self.assertEqual(expected_last, _handle_authors(name)[0])
        self.assertEqual(expected_init, _handle_authors(name)[1])
class TestHandlePaper(unittest.TestCase):
    """Check ``_handle_paper`` on an element from the sample citations XML."""

    def test_handle_Paper(self):
        """Convert the first ``article`` element and inspect the result.

        The result must be a ``Paper`` whose 'date' item equals 1954.
        """
        with open(sample_datapath, 'r') as handle:
            xml_text = handle.read()
            root = ET.fromstring(xml_text)

        # Search the whole tree for ``article`` elements, at any depth.
        articles = root.findall('.//article')
        paper = _handle_paper(articles[0])

        self.assertIsInstance(paper, Paper)
        self.assertEqual(1954, paper['date'])
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()