# Source code for tethne.tests.test_readers_wos

import sys
sys.path.append('../tethne')

import re

import unittest
from tethne.readers.wos import WoSParser, read
from tethne import Corpus, Paper, StreamingCorpus

# Relative paths to the WoS fixture files read by the tests below.
# NOTE(review): being relative, these presumably require the tests to be
# launched from the repository root -- confirm.
datapath = './tethne/tests/data/wos2.txt'
datapath_v = './tethne/tests/data/valentin.txt'

import sys
# Python 3 has no ``unicode`` builtin, so alias it to ``str``; the type
# assertions below can then use the single name ``unicode`` on either version.
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    unicode = str


def is_number(value):
    """
    Report whether ``value`` can be converted to an ``int`` or a ``float``.

    Parameters
    ----------
    value : object
        Candidate value, e.g. a string such as ``'12'`` or ``'3.5'``.

    Returns
    -------
    bool
        True if ``int(value)`` or ``float(value)`` succeeds. False if both
        conversions are rejected, including for values of an unsupported type
        such as ``None`` or a list.
    """
    for convert in (int, float):
        try:
            convert(value)
        except (TypeError, ValueError):
            # ValueError: the text is not numeric for this converter.
            # TypeError: the object cannot be converted at all (e.g. None).
            continue
        return True
    return False


class TestWoSParserStreaming(unittest.TestCase):
    """Type checks on the records yielded by ``read(..., streaming=True)``."""

    def test_read(self):
        """
        Read the WoS fixture in streaming mode and verify field types.

        The result must be a ``StreamingCorpus``. Each field is only checked
        on the entries that actually carry it.
        """
        streamed = read(datapath, streaming=True)
        self.assertIsInstance(streamed, StreamingCorpus)

        type_msg = "{0} should be {1}, but is {2}"
        address_msg = "{0} should be {1} or {2}, but is {3}"
        case_msg = "Author names should be uppercase"

        # (attribute, expected type, type name shown in the failure message),
        # listed in the order in which the fields are verified.
        plain_fields = (
            ('journal', unicode, 'unicode'),
            ('abstract', unicode, 'unicode'),
            ('WC', list, 'list'),
            ('subject', list, 'list'),
            ('authorKeywords', list, 'list'),
            ('keywordsPlus', list, 'list'),
            ('doi', unicode, 'unicode'),
            ('volume', unicode, 'unicode'),
            ('title', unicode, 'unicode'),
        )

        for entry in streamed:
            if hasattr(entry, 'date'):
                year = entry.date
                self.assertIsInstance(
                    year, int, type_msg.format('date', 'int', type(year)))

            # Both author fields must be lists whose items have uppercase
            # first and second components.
            for field in ('authors_full', 'authors_init'):
                if not hasattr(entry, field):
                    continue
                names = getattr(entry, field)
                self.assertIsInstance(
                    names, list, type_msg.format(field, 'list', type(names)))
                for name in names:
                    self.assertTrue(name[0].isupper(), case_msg)
                    self.assertTrue(name[1].isupper(), case_msg)

            for field, expected, label in plain_fields:
                if not hasattr(entry, field):
                    continue
                found = getattr(entry, field)
                self.assertIsInstance(
                    found, expected,
                    type_msg.format(field, label, type(found)))

            if hasattr(entry, 'authorAddress'):
                # Exact type match: either a list or a unicode string.
                address_type = type(entry.authorAddress)
                self.assertTrue(
                    (address_type is list or address_type is unicode),
                    address_msg.format('authorAddress', 'unicode',
                                       'list', address_type))

class TestWoSParser(unittest.TestCase):
    """Tests for ``read`` and ``WoSParser`` against the ``wos2.txt`` fixture."""

    def test_read(self):
        """``read`` with default arguments should return a ``Corpus``."""
        corpus = read(datapath)
        self.assertIsInstance(corpus, Corpus)

    def test_read_nocorpus(self):
        """``read(..., corpus=False)`` should return a list of ``Paper``."""
        papers = read(datapath, corpus=False)
        self.assertIsInstance(papers, list)
        self.assertIsInstance(papers[0], Paper)

    def test_parse(self):
        """
        Parse the fixture with ``WoSParser`` and check the parsed records.

        Verifies, in order: the types of the most common fields, that no
        record still carries a raw tag listed in ``parser.tags``, that 100
        records were parsed, and that the first record has cited references
        exposing ``date`` and ``journal``.
        """
        parser = WoSParser(datapath)
        parser.parse()

        derror = "{0} should be {1}, but is {2}"
        dauthorAddressError = "{0} should be {1} or {2}, but is {3}"
        uppererr = "Author names should be uppercase"

        # (attribute, expected type, type name shown in the failure message).
        plain_fields = (
            ('journal', unicode, 'unicode'),
            ('abstract', unicode, 'unicode'),
            ('WC', list, 'list'),
            ('subject', list, 'list'),
            ('authorKeywords', list, 'list'),
            ('keywordsPlus', list, 'list'),
            ('doi', unicode, 'unicode'),
            ('volume', unicode, 'unicode'),
            ('title', unicode, 'unicode'),
        )

        # Check data types for the most common fields. Fields are only
        # checked on the records that actually have them.
        for e in parser.data:
            if hasattr(e, 'date'):
                self.assertIsInstance(e.date, int,
                                      derror.format('date', 'int',
                                                    type(e.date)))

            for field in ('authors_full', 'authors_init'):
                if not hasattr(e, field):
                    continue
                names = getattr(e, field)
                self.assertIsInstance(names, list,
                                      derror.format(field, 'list',
                                                    type(names)))
                for a in names:
                    self.assertTrue(a[0].isupper(), uppererr)
                    self.assertTrue(a[1].isupper(), uppererr)

            for field, expected, label in plain_fields:
                if not hasattr(e, field):
                    continue
                value = getattr(e, field)
                self.assertIsInstance(value, expected,
                                      derror.format(field, label,
                                                    type(value)))

            if hasattr(e, 'authorAddress'):
                self.assertTrue(
                    isinstance(e.authorAddress, (list, unicode)),
                    dauthorAddressError.format('authorAddress', 'unicode',
                                               'list',
                                               type(e.authorAddress)))

        # Check integrity of tag-to-field mapping: a raw tag must not survive
        # as an attribute on any record. ``items()`` is used because
        # ``iteritems()`` does not exist on Python 3, and every record is
        # inspected rather than relying on a loop variable leaked from above.
        for tag, attr in parser.tags.items():
            for e in parser.data:
                self.assertFalse(
                    hasattr(e, tag),
                    '{0} should map to {1}, but does not.'.format(tag, attr))

        # Check number of records.
        N = len(parser.data)
        self.assertEqual(N, 100, 'Expected 100 entries, found {0}.'.format(N))

        # Cited references of the first record must expose a date (an int
        # whenever it is truthy) and a journal.
        self.assertTrue(hasattr(parser.data[0], 'citedReferences'))
        for cr in parser.data[0].citedReferences:
            self.assertTrue(hasattr(cr, 'date'))
            if cr.date:
                self.assertIsInstance(cr.date, int)
            self.assertTrue(hasattr(cr, 'journal'))

class TestWithStarCR(unittest.TestCase):
    """
    Checks cited-reference (``CR``) handling on the ``valentin.txt`` fixture,
    for values that start with ``*`` or with a four-digit number.
    """

    def setUp(self):
        """Create a ``WoSParser`` subclass whose ``handle_CR`` self-checks."""
        class TestParser(WoSParser):
            def handle_CR(self, value):
                """
                Delegate to ``WoSParser.handle_CR`` and assert on its result.

                Raises ``AssertionError`` when a check fails; returns the
                citation produced by the parent implementation.
                """
                citation = super(TestParser, self).handle_CR(value)

                # When the raw value starts with four digits, those digits
                # must not end up among the values taken from the authors.
                datematch = re.match('([0-9]{4})', value)
                if datematch:
                    date = datematch.group(1)
                    if len(citation.authors) > 0:
                        # zip() returns an iterator on Python 3, so it has to
                        # be materialised before it can be indexed.
                        names = list(zip(*citation.authors))[0]
                        last = [unicode(v[0]) for v in zip(*names)]
                        assert unicode(date) not in last

                # The lastname part of the first author name should match the
                # CR author name as written, but without the leading *.
                if value.startswith('*'):
                    expected = re.match(r'\*([\w\s]+)\,', value).group(1)
                    assert expected == citation.authors[0][0][0]
                    assert not is_number(citation.authors[0][0][0])

                # Hand the parent's result back; without this the override
                # would implicitly return None in its place.
                return citation

        self.parser = TestParser(datapath_v)

    def test_citedRefs(self):
        """Parsing must not trip any assertion inside ``handle_CR``."""
        try:
            self.parser.parse()
        except AssertionError:
            self.fail('Trouble processing cited references')



# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()