Need help? Have a feature request? Please check out the tethne-users group.
Source code for tethne.tests.test_readers_zotero
import sys
sys.path.append('../tethne')
import re
import unittest
from tethne.readers.zotero import read, ZoteroParser, _infer_spaces
from tethne import Corpus, Paper, StructuredFeatureSet
import sys
# True when the running interpreter is Python 3; drives the compatibility
# alias below.
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
    # Python 3 has no ``unicode`` builtin. Alias it to ``str`` so the
    # ``assertIsInstance(..., unicode, ...)`` checks in this module work on
    # both major versions.
    unicode = str

# Fixture locations. These are relative paths, so they resolve against the
# current working directory of the test run.
datapath = './tethne/tests/data/zotero/zotero.rdf'
datapath2 = './tethne/tests/data/zotero2/zotero2.rdf'
# Only referenced by the commented-out PDF test further down in this module.
datapath3 = './tethne/tests/data/zotero_withfiles/zotero_withfiles.rdf'
# NOTE(review): no file extension -- presumably a directory that ``read``
# resolves itself; confirm against the reader.
duplicatePath = './tethne/tests/data/Duplicate'
class TestInferSpaces(unittest.TestCase):
    """Exercise the ``_infer_spaces`` helper imported from the Zotero reader."""

    def test_infer(self):
        """
        A run-together sentence should come back lower-cased and split into
        space-separated tokens, with the trailing period as its own token.
        """
        s = "Thisisastringwithnospaces."
        self.assertEqual(_infer_spaces(s), 'this is a string with no spaces .')
#
# Incorporating PDF extraction into Tethne is a bit too far beyond the scope of
# the project. We should focus on making it easy for people to work with
# plain text corpora.
#
# class TestZoteroParserWithFiles(unittest.TestCase):
# """
# When Tethne reads a Zotero collection, it should attempt to extract
# full-text content for the constituent bibliographic records.
# """
#
# def test_read_pdf(self):
# corpus = read(datapath3, follow_links=True)
#
# self.assertIsInstance(corpus, Corpus)
#
# self.assertIn('pdf_text', corpus.features,
# """
# If a dataset has full-text content available in PDFs, then
# 'structuredfeatures' should contain an element called 'pdf_text'.
# """)
#
# self.assertIsInstance(corpus.features['pdf_text'],
# StructuredFeatureSet,
# """
# 'pdf_text' should be an instance of StructuredFeatureSet.
# """)
#
# self.assertEqual(len(corpus.features['pdf_text']), 7,
# """
# There should be seven (7) full-text pdf StructuredFeatures for this
# particular dataset.
# """)
class TestZoteroDuplicates(unittest.TestCase):
    """Read a Zotero fixture that is expected to contain duplicate papers."""

    def test_duplicate_Papers_length(self):
        """
        Read the duplicate-papers fixture as a Corpus.

        Duplicate papers are papers that share the same ``index_by`` field
        value. Examples:

        * Two papers in a Zotero collection with the same URI are duplicates.
        * Two papers from Web of Science with the same WOSID are duplicates.

        The intended checks are that ``corpus.duplicate_papers`` (a
        dictionary) is populated, and that
        ``duplicate_papers['http://www.jstor.org/stable/2460126'] == 2``,
        i.e. there are 2 papers with that URI.

        NOTE: both assertions are currently commented out, so as written this
        test only fails if ``read`` itself raises.
        """
        corpus = read(duplicatePath, corpus=True)
        # self.assertGreater(len(corpus.duplicate_papers), 0)
        # self.assertEqual(corpus.duplicate_papers['http://www.jstor.org/stable/2460126'], 2)
class TestZoteroParser(unittest.TestCase):
    """Tests for ``read`` and ``ZoteroParser`` against the Zotero RDF fixtures."""

    def test_read(self):
        """``read`` with ``follow_links=True`` should yield a Corpus."""
        corpus = read(datapath, follow_links=True)
        self.assertIsInstance(corpus, Corpus)

    def test_read_files(self):
        """``read`` with ``index_by='uri'`` should yield a Corpus."""
        # TODO: attempt to read contents of files?
        corpus = read(datapath2, index_by='uri')
        self.assertIsInstance(corpus, Corpus)

    def test_read_nocorpus(self):
        """With ``corpus=False``, ``read`` should return a list of Papers."""
        papers = read(datapath, corpus=False)
        self.assertIsInstance(papers, list)
        self.assertIsInstance(papers[0], Paper)

    def test_authors(self):
        """
        Every paper read from the Zotero fixture must have a non-empty
        ``authors`` value.

        Fails when any paper's ``authors`` has length zero.
        """
        papers = read(datapath)
        for paper in papers:
            self.assertNotEqual(len(paper.authors), 0,
                                "Author list cannot be empty")

    def test_authors_full(self):
        """
        Every paper read from the Zotero fixture must have a non-empty
        ``authors_full`` value.

        Fails when any paper's ``authors_full`` has length zero.
        """
        papers = read(datapath)
        for paper in papers:
            self.assertNotEqual(len(paper.authors_full), 0,
                                "Author_full list cannot be empty")

    def test_handle_date(self):
        """``handle_date`` should give 2015 for each supported date format."""
        parser = ZoteroParser(datapath)
        parser.parse()
        date_list = ["January 23, 2015",
                     "2015-9",
                     "2015-9-23",
                     "09/23/2015",
                     "2015-09-23"]
        for each_date in date_list:
            self.assertEqual(2015, parser.handle_date(each_date),
                             "\nDate Not properly Formatted.\n")

    def _assert_field_type(self, entry, field, expected, label):
        """
        Assert that ``entry.<field>`` is an instance of ``expected``.

        Entries that lack the attribute are skipped. ``label`` is the type
        name used in the failure message.
        """
        if not hasattr(entry, field):
            return
        value = getattr(entry, field)
        self.assertIsInstance(
            value, expected,
            "{0} should be {1}, but is {2}".format(field, label, type(value)))

    def test_parse(self):
        """
        Parse the fixture and check field types, author-name casing, the
        tag-to-field mapping, and the number of records.
        """
        parser = ZoteroParser(datapath)
        parser.parse()

        uppererr = "Author names should be uppercase"

        # Checked in this order, after the author fields, so that the first
        # reported failure matches the field order of the checks.
        typed_fields = (
            ('journal', unicode, 'unicode'),
            ('abstract', unicode, 'unicode'),
            ('authorKeywords', list, 'list'),
            ('keywordsPlus', list, 'list'),
            ('doi', unicode, 'unicode'),
            ('volume', unicode, 'unicode'),
            ('title', unicode, 'unicode'),
        )

        for e in parser.data:
            # Check data types for the most common fields.
            self._assert_field_type(e, 'date', int, 'int')

            for field in ('authors_full', 'authors_init'):
                if not hasattr(e, field):
                    continue
                self._assert_field_type(e, field, list, 'list')
                for a in getattr(e, field):
                    # The first two components of each author entry must
                    # both be upper-case.
                    self.assertTrue(a[0].isupper(), uppererr)
                    self.assertTrue(a[1].isupper(), uppererr)

            for field, expected, label in typed_fields:
                self._assert_field_type(e, field, expected, label)

            # Check integrity of tag-to-field mapping: a raw tag name must
            # not survive as an attribute on the parsed entry.
            # ``items()`` rather than ``iteritems()``: the latter does not
            # exist on Python 3, which this module supports via PYTHON_3.
            for tag, attr in parser.tags.items():
                self.assertFalse(
                    hasattr(e, tag),
                    '{0} should map to {1}, but does not.'.format(tag, attr))

        # Check number of records.
        N = len(parser.data)
        self.assertEqual(N, 12, 'Expected 12 entries, found {0}.'.format(N))
if __name__ == '__main__':
    # Run the tests in this module when the file is executed as a script.
    unittest.main()