Source code for tethne.readers.dfr
"""
Methods for parsing JSTOR Data-for-Research datasets.
.. autosummary::
"""
import os
import xml.etree.ElementTree as ET
import re
from collections import Counter
from tethne import Paper, Corpus, Feature, FeatureSet, StreamingCorpus
from tethne.utilities import dict_from_node, strip_non_ascii, number
from tethne.readers.base import XMLParser
import iso8601
from io import BytesIO
from unidecode import unidecode
import codecs
import sys
PYTHON_3 = sys.version_info[0] == 3
if PYTHON_3:
unicode = str
class DfRParser(XMLParser):
entry_class = Paper
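    # Maps element names in the DfR citations XML onto Paper field names.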
tags = {
'type': 'documentType',
'pubdate': 'date',
'journaltitle': 'journal',
'author': 'authors_full',
}
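
    # A minimal sketch of direct use (read() below wraps this for you):
    #
    #     parser = DfRParser('/Path/to/DfR/citations.xml')
    #     papers = parser.parse()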
    def open(self):
        with codecs.open(self.path, 'r', encoding="utf-8") as f:
            # JSTOR hasn't always represented ampersands correctly.
            contents = re.sub('(&)(?!amp;)', lambda match: '&amp;', f.read())

        # self.root = ET.fromstring(contents)
        # pattern = './/{elem}'.format(elem=self.entry_element)
        # self.elements = self.root.findall(pattern)
        self.iterator = ET.iterparse(BytesIO(contents.encode('utf-8')))

        self.at_start = False
        self.at_end = False
        self.children = []
    def handle_unicode(self, value):
        # if type(value) is not str:
        #     value = unidecode(value)
        return value
    def handle_author(self, value):
        # if type(value) is not str:
        #     value = unidecode(value)
        tokens = value.split(' ')
        final = tokens[-1].upper()
        # Keep generational suffixes (e.g. "Jr.", "III") with the surname.
        if final in ['JR.', 'III']:
            aulast = tokens[-2].upper() + " " + final.strip(".")
            auinit = ' '.join(tokens[0:-2]).replace('.', '').strip().upper()
        else:
            aulast = final
            auinit = ' '.join(tokens[0:-1]).replace('.', '').strip().upper()
        return aulast, auinit
    def postprocess_authors_full(self, entry):
        # A single author is parsed as a bare value; normalize to a list.
        if type(entry.authors_full) is not list:
            entry.authors_full = [entry.authors_full]
class GramGenerator(object):
    """
    Yields N-gram data from an on-disk dataset, to make loading big datasets
    a bit more memory-friendly.

    Reusable, in the sense that :func:`.items`\, :func:`.keys`\, and
    :func:`.values` all return new :class:`.GramGenerator` instances with the
    same path. This allows a :class:`.GramGenerator` to sneakily pass as an
    ngrams dict in most practical situations.
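
    Examples
    --------
    A minimal sketch, assuming an unzipped DfR dataset at ``/Path/to/DfR``
    with a ``bigrams`` subdirectory:

    .. code-block:: python

       >>> g = GramGenerator('/Path/to/DfR', 'bigrams')
       >>> len(g)                 # Number of XML data files found.
       >>> for doi, grams in g:   # Yields (key, value) pairs by default.
       ...     pass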
"""
    def __init__(self, path, elem, values=False, keys=False, ignore_hash=True):
        """
        Parameters
        ----------
        path : str
            Path to unzipped JSTOR DfR folder containing N-grams (e.g.
            'bigrams').
        elem : str
            Element in DfR dataset containing data of interest. E.g. 'bigrams'.
        values : bool
            If True, :func:`.next` returns only values. Otherwise, returns
            (key, value) tuples.
        keys : bool
            If True, :func:`.next` returns only keys. ``values`` and ``keys``
            cannot both be True.
        ignore_hash : bool
            If True (default), excludes N-grams that contain the '#' character.
        """
        if not os.path.exists(path):
            raise ValueError('No such file or directory')

        self.path = path
        self.elem = elem

        # Subdirectory names are plural (e.g. 'bigrams'), but the XML elements
        # inside each data file are singular (e.g. 'bigram').
        if elem.endswith('s'):
            self.elem_xml = elem[:-1]
        else:
            self.elem_xml = elem

        self.ignore_hash = ignore_hash

        # Index only the XML data files, so that positional access (_get)
        # lines up with the file count.
        self.files = [f for f in os.listdir(os.path.join(path, elem))
                      if f.split('.')[-1].lower() == 'xml']
        self.N = len(self.files)
        self.i = 0

        self.V = values
        self.K = keys
        if self.V and self.K:
            raise ValueError('values and keys cannot both be True.')
def __len__(self):
return self.N
def __iter__(self):
return self
def __next__(self):
return self.next()
    def next(self):
        if self.i < self.N:
            cur = int(self.i)
            self.i += 1
            return self._get(cur)
        else:
            raise StopIteration()
    def items(self):
        """
        Returns a :class:`GramGenerator` that produces (key, value) tuples.
        """
        return GramGenerator(self.path, self.elem,
                             ignore_hash=self.ignore_hash)
    def values(self):
        """
        Returns a :class:`GramGenerator` that produces only values.
        """
        return GramGenerator(self.path, self.elem, values=True,
                             ignore_hash=self.ignore_hash)
    def keys(self):
        """
        Returns a :class:`GramGenerator` that produces only keys.
        """
        return GramGenerator(self.path, self.elem, keys=True,
                             ignore_hash=self.ignore_hash)
def __getitem__(self, key):
return self._get(key)
    def _get(self, i):
        """
        Retrieve data for the ith file in the dataset.
        """
        fpath = os.path.join(self.path, self.elem, self.files[i])
        with codecs.open(fpath, 'rb', encoding='utf-8') as f:
            # JSTOR hasn't always produced valid XML.
            contents = re.sub('(&)(?!amp;)', lambda match: '&amp;', f.read())

        # ET.fromstring() is happier with encoded bytes.
        root = ET.fromstring(contents.encode('utf-8'))
        doi = root.attrib['id']

        if self.K:      # Keys only.
            return doi

        grams = []
        for gram in root.findall(self.elem_xml):
            text = gram.text.strip()
            if not isinstance(text, unicode):
                text = text.decode('utf-8')
            if not self.ignore_hash or '#' not in text:
                grams.append((text, number(gram.attrib['weight'])))

        if self.V:      # Values only.
            return grams
        return doi, grams   # Default: (key, value) tuples.
def _get_citation_filename(basepath):
    """
    Returns the name of the citations file in ``basepath``, or None if no
    such file exists.
    """
    for fname in ["citations.xml", "citations.XML"]:
        if os.path.exists(os.path.join(basepath, fname)):
            return fname
def streaming_read(path, corpus=True, index_by='doi', parse_only=None,
                   **kwargs):
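    """
    Streaming variant of :func:`.read`\: parses the same DfR package, but
    builds a :class:`.StreamingCorpus` rather than an in-memory
    :class:`.Corpus`\.

    .. code-block:: python

       >>> from tethne.readers import dfr
       >>> corpus = dfr.streaming_read("/Path/to/DfR")
    """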
    return read(path, corpus=corpus, index_by=index_by, parse_only=parse_only,
                corpus_class=StreamingCorpus, **kwargs)
def read(path, corpus=True, index_by='doi', load_ngrams=True, parse_only=None,
         corpus_class=Corpus, **kwargs):
"""
Yields :class:`.Paper` s from JSTOR DfR package.
Each :class:`.Paper` is tagged with an accession id for this
read/conversion.
Parameters
----------
filepath : string
Filepath to unzipped JSTOR DfR folder containing a citations.xml file.
Returns
-------
papers : list
A list of :class:`.Paper` objects.
Examples
--------
.. code-block:: python
>>> from tethne.readers import dfr
>>> papers = dfr.read("/Path/to/DfR")
"""
citationfname = _get_citation_filename(path)
features = {}
featureset_types = {}
# We need the primary index field in the parse results.
if parse_only:
parse_only.append(index_by)
papers = []
if citationfname: # Valid DfR dataset.
parser = DfRParser(os.path.join(path, citationfname))
papers += parser.parse(parse_only=parse_only)
else: # Possibly a directory containing several DfR datasets?
papers = []
# Search for DfR datasets in subdirectories.
for dirpath, dirnames, filenames in os.walk(path):
citationfname = _get_citation_filename(dirpath)
if citationfname:
subcorpus = read(dirpath, index_by=index_by,
parse_only=parse_only)
papers += subcorpus.papers
                for featureset_name, featureset in subcorpus.features.items():
                    if featureset_name not in features:
                        features[featureset_name] = {}
                    features[featureset_name].update(featureset.items())
                    featureset_types[featureset_name] = type(featureset)
                # The recursive read() calls above already loaded any N-gram
                # data, so don't attempt to load them again below.
                load_ngrams = False
if len(papers) == 0:
raise ValueError('No DfR datasets found at %s' % path)
if corpus:
corpus = corpus_class(papers, index_by=index_by, **kwargs)
if load_ngrams: # Find and read N-gram data.
for sname in os.listdir(path):
fpath = os.path.join(path, sname) # Full path.
if os.path.isdir(fpath) and not sname.startswith('.'):
datafiles = [f for f in os.listdir(fpath)
if f.lower().endswith('xml')]
if len(datafiles) > 0:
features[sname] = ngrams(path, sname)
        for featureset_name, featureset_values in features.items():
            if type(featureset_values) is dict:
                fclass = featureset_types[featureset_name]
                featureset_values = fclass(featureset_values)
            corpus.features[featureset_name] = featureset_values
return corpus
return papers
def ngrams(path, elem, ignore_hash=True):
"""
Yields N-grams from a JSTOR DfR dataset.
Parameters
----------
path : string
Path to unzipped JSTOR DfR folder containing N-grams.
elem : string
Name of subdirectory containing N-grams. (e.g. 'bigrams').
ignore_hash : bool
If True, will exclude all N-grams that contain the hash '#' character.
Returns
-------
ngrams : :class:`.FeatureSet`
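
    Examples
    --------
    A minimal sketch, assuming the package at ``/Path/to/DfR`` contains a
    ``bigrams`` subdirectory:

    .. code-block:: python

       >>> from tethne.readers import dfr
       >>> bigrams = dfr.ngrams("/Path/to/DfR", "bigrams")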
"""
grams = GramGenerator(path, elem, ignore_hash=ignore_hash)
return FeatureSet({k: Feature(f) for k, f in grams})
def tokenize(ngrams, min_tf=2, min_df=2, min_len=3, apply_stoplist=False):
"""
Builds a vocabulary, and replaces words with vocab indices.
Parameters
----------
ngrams : dict
Keys are paper DOIs, values are lists of (Ngram, frequency) tuples.
apply_stoplist : bool
If True, will exclude all N-grams that contain words in the NLTK
stoplist.
Returns
-------
t_ngrams : dict
Tokenized ngrams, as doi:{i:count}.
vocab : dict
Vocabulary as i:term.
token_tf : :class:`.Counter`
Term counts for corpus, as i:count.
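
    Examples
    --------
    A minimal sketch with hypothetical input (keys are DOIs, values are
    (Ngram, frequency) tuples, per the parameter description above):

    .. code-block:: python

       >>> ngrams = {'10.2307/0000001': [('the science', 4)],
       ...           '10.2307/0000002': [('the science', 3)]}
       >>> t_ngrams, vocab, token_tf = tokenize(ngrams)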
"""
vocab = {}
vocab_ = {}
word_tf = Counter()
word_df = Counter()
token_tf = Counter()
token_df = Counter()
t_ngrams = {}
# Get global word counts, first.
for grams in ngrams.values():
for g,c in grams:
word_tf[g] += c
word_df[g] += 1
    if apply_stoplist:
        # NLTK is required only when a stoplist is requested.
        from nltk.corpus import stopwords
        stoplist = set(stopwords.words())
# Now tokenize.
    for doi, grams in ngrams.items():
t_ngrams[doi] = []
for g,c in grams:
ignore = False
# Ignore extremely rare words (probably garbage).
if word_tf[g] < min_tf or word_df[g] < min_df or len(g) < min_len:
ignore = True
# Stoplist.
elif apply_stoplist:
for w in g.split():
if w in stoplist:
ignore = True
            if not ignore:
                # Coerce bytestrings (Python 2 input) to unicode.
                if not isinstance(g, unicode):
                    g = g.decode('utf-8')
                # Look up (or assign) a vocabulary index for this gram.
                if g not in vocab_:
                    i = len(vocab)
                    vocab[i] = g
                    vocab_[g] = i
                else:
                    i = vocab_[g]
                token_tf[i] += c
                token_df[i] += 1
                t_ngrams[doi].append((i, c))
return t_ngrams, vocab, token_tf
def _handle_paper(article):
"""
    Returns a :class:`.Paper` from an 'article' ET node.
Parameters
----------
article : Element
ElementTree Element 'article'.
Returns
-------
paper : :class:`.Paper`
"""
paper = Paper()
pdata = dict_from_node(article)
    for key, datum in pdata.items():
        if isinstance(datum, bytes):
            datum = datum.decode('utf-8')
        if isinstance(datum, unicode):
            datum = datum.upper()
            # datum = unidecode(datum).upper()
        paper[key] = datum

    # Handle author names.
    adata = _handle_authors(pdata['author'])
    paper.authors_init = list(zip(adata[0], adata[1]))
# Handle pubdate.
paper['date'] = _handle_pubdate(pdata['pubdate'])
# Handle pagerange.
paper['spage'], paper['epage'] = _handle_pagerange(pdata['pagerange'])
return paper
def _handle_pagerange(pagerange):
"""
    Extracts start and end pages from a DfR pagerange field.
Parameters
----------
pagerange : str or unicode
DfR-style pagerange, e.g. "pp. 435-444".
Returns
-------
start : str
Start page.
end : str
End page.
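
    Examples
    --------

    .. code-block:: python

       >>> _handle_pagerange("pp. 435-444")   # -> ('435', '444')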
"""
    try:
        pr = re.compile(r"pp\.\s([0-9]+)-([0-9]+)")
        start, end = re.findall(pr, pagerange)[0]
    except IndexError:
        start = end = 0
    return unicode(start), unicode(end)
def _handle_pubdate(pubdate):
"""
    Returns the publication year from a DfR pubdate field as an integer.
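
    Examples
    --------
    Assuming the pubdate is a date string that begins with the year:

    .. code-block:: python

       >>> _handle_pubdate("1968-01-01T00:00:00Z")   # -> 1968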
"""
return int(pubdate[0:4])
def _handle_authors(authors):
"""
    Returns aulast and auinit lists from the value of an 'author' node.
Parameters
----------
authors : list, str, or unicode
Value or values of 'author' element in DfR XML.
Returns
-------
aulast : list
A list of author surnames (string).
auinit : list
A list of author first-initials (string).
"""
aulast = []
auinit = []
    if type(authors) is list:
        for author in authors:
            if not isinstance(author, unicode):
                author = author.decode('utf-8')
            l, i = _handle_author(author)
            aulast.append(l)
            auinit.append(i)
    elif type(authors) in [str, unicode]:
        if not isinstance(authors, unicode):
            authors = authors.decode('utf-8')
        l, i = _handle_author(authors)
        aulast.append(l)
        auinit.append(i)
else:
raise ValueError("authors must be a list or a string")
return aulast, auinit
def _handle_author(author):
"""
    Returns aulast and auinit from an author's full name.
Parameters
----------
author : str or unicode
Author fullname, e.g. "Richard L. Nixon".
Returns
-------
aulast : str
Author surname.
auinit : str
Author first-initial.
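
    Examples
    --------

    .. code-block:: python

       >>> _handle_author("Richard L. Nixon")   # -> ('NIXON', 'R')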
"""
lname = author.split(' ')
try:
auinit = lname[0][0]
final = lname[-1].upper()
if final in ['JR.', 'III']:
aulast = lname[-2].upper() + " " + final.strip(".")
else:
aulast = final
except IndexError:
raise ValueError("malformed author name")
return aulast, auinit
def _dfr2paper_map():
"""
Defines the direct relationships between DfR article elements and
:class:`.Paper` fields.
Returns
-------
translator : dict
A 'translator' dictionary.
"""
translator = { 'doi': 'doi',
'title': 'atitle',
'journaltitle': 'jtitle',
'volume': 'volume',
'issue': 'issue' }
return translator
def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
"""
    Converts aulast, auinit, date, and jtitle into the fuzzy identifier ayjid.
    Returns 'Unknown paper' if all id components are missing (None).

    Parameters
    ----------
    aulast : string
        Author surname.
    auinit : string
        Author initial(s).
    date : string
        Four-digit year.
    jtitle : string
        Title of the journal.
Returns
-------
ayj : string
Fuzzy identifier ayjid, or 'Unknown paper' if all id components are
missing (None).
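
    Examples
    --------

    .. code-block:: python

       >>> _create_ayjid(aulast='Nixon', auinit='R', date=1968,
       ...               jtitle='Journal of Paranoia')   # hypothetical values
       'NIXON R 1968 JOURNAL OF PARANOIA'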
"""
if aulast is None:
aulast = ''
elif isinstance(aulast, list):
aulast = aulast[0]
if auinit is None:
auinit = ''
elif isinstance(auinit, list):
auinit = auinit[0]
if date is None:
date = ''
if jtitle is None:
jtitle = ''
ayj = aulast + ' ' + auinit + ' ' + unicode(date) + ' ' + jtitle
if ayj == ' ':
ayj = 'Unknown paper'
return ayj.upper()