"""
Methods for parsing JSTOR Data-for-Research datasets.
.. autosummary::
read
ngrams
read_corpus
from_dir
ngrams_from_dir
corpus_from_dir
tokenize
"""
from ..classes import Paper, Corpus
import os
import xml.etree.ElementTree as ET
import re
from ..utilities import dict_from_node, strip_non_ascii
from nltk.corpus import stopwords
import uuid
from collections import Counter
from unidecode import unidecode
class GramGenerator(object):
"""
    Yields N-gram data from an on-disk dataset, to make loading big datasets
    a bit more memory-friendly.
Reusable, in the sense that :func:`.items`\, :func:`.iteritems`\,
:func:`.keys`\, and :func:`.values` all return new :class:`.GramGenerator`
instances with the same path. This allows a :class:`.GramGenerator` to
sneakily pass as an ngrams dict in most practical situations.
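
    Examples
    --------
    A minimal usage sketch (the path is hypothetical):

    .. code-block:: python

       >>> g = GramGenerator("/Path/to/DfR/bigrams", 'bigram')
       >>> len(g)                    # Number of XML data files found.
       >>> for doi, grams in g:      # (key, value) tuples by default.
       ...     pass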
"""
def __init__(self, path, elem, values=False, keys=False, ignore_hash=True):
"""
Parameters
----------
path : str
Path to unzipped JSTOR DfR folder containing N-grams (e.g.
'bigrams').
elem : str
Element in DfR dataset containing data of interest. E.g. 'bigrams'.
        values : bool
            If True, :func:`.next` returns only values. Otherwise, returns
            (key, value) tuples.
        keys : bool
            If True, :func:`.next` returns only keys.
        ignore_hash : bool
            If True (default), excludes N-grams that contain the hash '#'
            character.
        """
self.path = path
self.elem = elem
self.ignore_hash = ignore_hash
        # Keep only XML data files, so positional indexing stays in sync
        # with the file count.
        self.files = [ f for f in os.listdir(path) if f.split('.')[-1] == 'XML' ]
        self.N = len(self.files)
self.i = 0
self.V = values
self.K = keys
if self.V and self.K:
            raise ValueError('values and keys cannot both be True.')
def __len__(self):
return self.N
def __iter__(self):
return self
    def __next__(self):
        return self.next()    # Python 3-style iteration.
    def next(self):
if self.i < self.N:
cur = int(self.i)
self.i += 1
return self._get(cur)
else:
raise StopIteration()
    def items(self):
"""
Returns a :class:`GramGenerator` that produces key,value tuples.
"""
return GramGenerator(self.path, self.elem, ignore_hash=self.ignore_hash)
    def iteritems(self):
"""
Returns a :class:`GramGenerator` that produces key,value tuples.
"""
return GramGenerator(self.path, self.elem, ignore_hash=self.ignore_hash)
    def values(self):
"""
Returns a :class:`GramGenerator` that produces only values.
"""
return GramGenerator(self.path, self.elem, values=True,
ignore_hash=self.ignore_hash)
    def keys(self):
"""
Returns a :class:`GramGenerator` that produces only keys.
"""
return GramGenerator(self.path, self.elem, keys=True,
ignore_hash=self.ignore_hash)
    def __getitem__(self, key):
        return self._get(key)    # ``key`` is a positional index, not a DOI.
def _get(self, i):
"""
Retrieve data for the ith file in the dataset.
"""
root = ET.parse(self.path + "/" + self.files[i]).getroot()
doi = root.attrib['id']
if self.K: # Keys only.
return doi
grams = []
for gram in root.findall(self.elem):
text = unidecode(unicode(gram.text.strip()))
if ( not self.ignore_hash or '#' not in list(text) ):
c = ( text, int(gram.attrib['weight']) )
grams.append(c)
if self.V: # Values only.
return grams
return doi, grams # Default behavior.
def read(datapath):
"""
    Builds a list of :class:`.Paper` instances from a JSTOR DfR dataset.
Each :class:`.Paper` is tagged with an accession id for this
read/conversion.
Parameters
----------
    datapath : string
Filepath to unzipped JSTOR DfR folder containing a citations.XML file.
Returns
-------
papers : list
A list of :class:`.Paper` objects.
Examples
--------
.. code-block:: python
>>> from tethne.readers import dfr
>>> papers = dfr.read("/Path/to/DfR")
"""
with open(datapath + "/citations.XML", mode='r') as f:
data = f.read()
        data = data.replace('&', '&amp;')    # Escape raw ampersands for the XML parser.
root = ET.fromstring(data)
accession = str(uuid.uuid4())
papers = []
for article in root:
paper = _handle_paper(article)
paper['accession'] = accession
papers.append(paper)
return papers
def read_corpus(path, features=None, exclude=None, **kwargs):
"""
Generate a :class:`.Corpus` from a single DfR dataset.
If ``features`` is provided (see below), will also load ngrams.
Parameters
----------
    path : string
Filepath to unzipped JSTOR DfR folder containing a citations.XML file.
features : list
List of feature-grams (e.g. 'uni', 'bi', 'tri') to load from dataset.
exclude : list
Stoplist for feature-grams.
**kwargs
Use this to pass kwargs to :func:`.ngrams`.
Returns
-------
:class:`.Corpus`
Examples
--------
.. code-block:: python
>>> from nltk.corpus import stopwords # Get a stoplist.
>>> stoplist = stopwords.words()
>>> from tethne.readers import dfr
>>> MyCorpus = dfr.read_corpus("/Path/to/DfR", ['uni'], stoplist)
"""
papers = read(path)
grams = {}
if features is not None:
for feat in features:
grams[feat+'grams'] = ngrams(path, feat, **kwargs)
return Corpus(papers, features=grams, index_by='doi', exclude=exclude)
def from_dir(path):
"""
Convenience function for generating a list of :class:`.Paper` from a
directory of JSTOR DfR datasets.
Parameters
----------
path : string
Path to directory containing DfR dataset directories.
Returns
-------
papers : list
A list of :class:`.Paper` objects.
Raises
------
IOError
Invalid path.
Examples
--------
.. code-block:: python
>>> from tethne.readers import dfr
>>> papers = dfr.from_dir("/Path/to/datadir")
"""
papers = []
try:
files = os.listdir(path)
except IOError:
raise IOError("Invalid path.") # Ignore hidden files.
for f in files:
if not f.startswith('.') and os.path.isdir(path + "/" + f):
try:
papers += read(path + "/" + f)
            except (IOError, UnboundLocalError):
                pass    # Skip directories that don't contain DfR data.
return papers
def ngrams_from_dir(path, N='uni', ignore_hash=True, mode='heavy'):
"""
Load ngrams from a directory of JSTOR DfR datasets.
Parameters
----------
path : string
Path to directory containing DfR dataset directories.
N : string
'uni', 'bi', 'tri', or 'quad'
ignore_hash : bool
If True, will exclude all N-grams that contain the hash '#' character.
mode : str
If 'heavy' (default), loads all data into memory and returns a dict. If
'light', returns a (somewhat) reusable :class:`.GramGenerator`\. See
:class:`.GramGenerator` for usage.
Returns
-------
ngrams : dict
Keys are paper DOIs, values are lists of (Ngram, frequency) tuples.
Examples
--------
.. code-block:: python
>>> from tethne.readers import dfr
>>> ngrams = dfr.ngrams_from_dir("/Path/to/datadir", 'uni')
"""
grams = {}
try:
files = os.listdir(path)
except IOError:
raise IOError('Invalid path.')
for f in files:
if not f.startswith('.') and os.path.isdir(path + '/' + f):
try:
fpath = path + '/' + f
grams.update(ngrams(fpath, N, ignore_hash, mode))
except (IOError, UnboundLocalError, OSError):
pass
return grams
def corpus_from_dir(path, features=None, exclude=None, **kwargs):
"""
Generate a :class:`.Corpus` from a directory containing multiple DfR
datasets.
If ``features`` is provided (see below), will also load ngrams.
Parameters
----------
path : string
Path to directory containing DfR dataset directories.
features : list
List of feature-grams (e.g. 'uni', 'bi', 'tri') to load from dataset.
exclude : list
Stoplist for feature-grams.
**kwargs
Use this to pass kwargs to :func:`.ngrams`.
Returns
-------
:class:`.Corpus`
Examples
--------
.. code-block:: python
>>> from nltk.corpus import stopwords # Get a stoplist.
>>> stoplist = stopwords.words()
>>> from tethne.readers import dfr
    >>> C = dfr.corpus_from_dir('/path/to/DfR/datasets', ['uni'], stoplist)
"""
papers = from_dir(path)
grams = {}
if features is not None:
for feat in features:
            grams[feat+'grams'] = ngrams_from_dir(path, feat, **kwargs)
return Corpus(papers, features=grams, index_by='doi', exclude=exclude)
def ngrams(datapath, N='uni', ignore_hash=True, mode='heavy'):
"""
    Loads N-grams from a JSTOR DfR dataset.
Parameters
----------
datapath : string
Path to unzipped JSTOR DfR folder containing N-grams (e.g. 'bigrams').
N : string
'uni', 'bi', 'tri', or 'quad'
ignore_hash : bool
If True, will exclude all N-grams that contain the hash '#' character.
mode : str
If 'heavy' (default), loads all data into memory and returns a dict. If
'light', returns a (somewhat) reusable :class:`.GramGenerator`\. See
:class:`.GramGenerator` for usage.
Returns
-------
ngrams : dict
Keys are paper DOIs, values are lists of (Ngram, frequency) tuples.
Examples
--------
.. code-block:: python
>>> from tethne.readers import dfr
>>> trigrams = dfr.ngrams("/Path/to/DfR", N='tri')
"""
    if N == 'uni':
gram_dir = "/wordcounts"
elem = "wordcount"
else:
gram_dir = "/" + N + "grams"
elem = N + "gram"
gram_path = datapath + gram_dir
if mode == 'light':
return GramGenerator(gram_path, elem, ignore_hash=ignore_hash)
elif mode == 'heavy':
ngrams = {}
        for fname in os.listdir(gram_path):    # Don't shadow builtin ``file``.
            if fname.split('.')[-1] == 'XML':
                root = ET.parse(gram_path + "/" + fname).getroot()
doi = root.attrib['id']
grams = []
for gram in root.findall(elem):
text = unidecode(unicode(gram.text.strip()))
if ( not ignore_hash or '#' not in list(text) ):
c = ( text, int(gram.attrib['weight']) )
grams.append(c)
ngrams[doi] = grams
return ngrams
def tokenize(ngrams, min_tf=2, min_df=2, min_len=3, apply_stoplist=False):
"""
Builds a vocabulary, and replaces words with vocab indices.
Parameters
----------
ngrams : dict
        Keys are paper DOIs, values are lists of (Ngram, frequency) tuples.
    min_tf : int
        Minimum corpus-wide term frequency; rarer grams are dropped.
    min_df : int
        Minimum document frequency; grams in fewer documents are dropped.
    min_len : int
        Minimum gram length, in characters.
    apply_stoplist : bool
        If True, will exclude all N-grams that contain words in the NLTK
        stoplist.
Returns
-------
t_ngrams : dict
Tokenized ngrams, as doi:{i:count}.
vocab : dict
Vocabulary as i:term.
token_tf : :class:`.Counter`
Term counts for corpus, as i:count.
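
    Examples
    --------
    A minimal sketch (the path is hypothetical):

    .. code-block:: python

       >>> from tethne.readers import dfr
       >>> unigrams = dfr.ngrams("/Path/to/DfR", N='uni')
       >>> t_ngrams, vocab, token_tf = dfr.tokenize(unigrams)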
"""
vocab = {}
vocab_ = {}
word_tf = Counter()
word_df = Counter()
token_tf = Counter()
token_df = Counter()
t_ngrams = {}
# Get global word counts, first.
for grams in ngrams.values():
for g,c in grams:
word_tf[g] += c
word_df[g] += 1
if apply_stoplist:
stoplist = stopwords.words()
# Now tokenize.
for doi,grams in ngrams.iteritems():
t_ngrams[doi] = []
for g,c in grams:
ignore = False
# Ignore extremely rare words (probably garbage).
if word_tf[g] < min_tf or word_df[g] < min_df or len(g) < min_len:
ignore = True
# Stoplist.
elif apply_stoplist:
for w in g.split():
if w in stoplist:
ignore = True
if not ignore:
# Coerce unicode to string.
if type(g) is str:
g = unicode(g)
g = unidecode(g)
                if g not in vocab_:    # O(1) lookup via the reverse map.
                    i = len(vocab)
                    vocab[i] = g
                    vocab_[g] = i
                else:
                    i = vocab_[g]
token_tf[i] += c
token_df[i] += 1
t_ngrams[doi].append( (i,c) )
return t_ngrams, vocab, token_tf
def _handle_paper(article):
"""
    Builds a :class:`.Paper` from an 'article' ET node.
Parameters
----------
article : Element
ElementTree Element 'article'.
Returns
-------
paper : :class:`.Paper`
"""
paper = Paper()
pdata = dict_from_node(article)
# Direct mappings.
translator = _dfr2paper_map()
for key, value in translator.iteritems():
if key in pdata: # Article may not have all keys of interest.
datum = pdata[key]
if type(datum) is str:
datum = unicode(datum)
if type(datum) is unicode:
datum = unidecode(datum).upper()
try: # For now, ignore weird types that come through in datum.
paper[value] = datum
except ValueError:
pass
    # Handle author names; an article may lack an author element entirely.
    paper['aulast'], paper['auinit'] = _handle_authors(pdata.get('author', []))
# Handle pubdate.
paper['date'] = _handle_pubdate(pdata['pubdate'])
# Handle pagerange.
paper['spage'], paper['epage'] = _handle_pagerange(pdata['pagerange'])
# Generate ayjid.
try:
paper['ayjid'] = _create_ayjid(paper['aulast'][0], paper['auinit'][0], \
paper['date'], paper['jtitle'])
except IndexError: # Article may not have authors.
pass
return paper
def _handle_pagerange(pagerange):
"""
    Extracts start and end pages from a DfR pagerange field.
Parameters
----------
pagerange : str or unicode
DfR-style pagerange, e.g. "pp. 435-444".
Returns
-------
start : str
Start page.
end : str
End page.
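
    Examples
    --------
    For example, using the sample pagerange above:

    .. code-block:: python

       >>> _handle_pagerange("pp. 435-444")
       ('435', '444')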
"""
try:
        pr = re.compile(r"pp\.\s([0-9]+)-([0-9]+)")
start, end = re.findall(pr, pagerange)[0]
except IndexError:
start = end = 0
return str(start), str(end)
def _handle_pubdate(pubdate):
"""
    Extracts the publication year, as an integer, from a DfR pubdate field.
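
    Examples
    --------
    For example, assuming the field begins with a four-digit year (the value
    shown is hypothetical):

    .. code-block:: python

       >>> _handle_pubdate("1960-01-01T00:00:00Z")
       1960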
"""
return int(pubdate[0:4])
def _handle_authors(authors):
"""
    Extracts aulast and auinit lists from the value of an authors node.
Parameters
----------
authors : list, str, or unicode
Value or values of 'author' element in DfR XML.
Returns
-------
aulast : list
A list of author surnames (string).
auinit : list
A list of author first-initials (string).
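
    Examples
    --------
    For example (hypothetical input):

    .. code-block:: python

       >>> _handle_authors(["Richard L. Nixon", "Alexander Hamilton"])
       (['NIXON', 'HAMILTON'], ['R', 'A'])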
"""
aulast = []
auinit = []
if type(authors) is list:
for author in authors:
if type(author) is str:
author = unicode(author)
author = unidecode(author)
try:
l,i = _handle_author(author)
aulast.append(l)
auinit.append(i)
except ValueError:
pass
elif type(authors) is str or type(authors) is unicode:
if type(authors) is str:
authors = unicode(authors)
author = unidecode(authors)
try:
l,i = _handle_author(author)
aulast.append(l)
auinit.append(i)
except ValueError:
pass
else:
raise ValueError("authors must be a list or a string")
return aulast, auinit
def _handle_author(author):
"""
    Extracts aulast and auinit from an author's full name.
Parameters
----------
author : str or unicode
Author fullname, e.g. "Richard L. Nixon".
Returns
-------
aulast : str
Author surname.
auinit : str
Author first-initial.
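
    Examples
    --------
    For example, using the fullname above:

    .. code-block:: python

       >>> _handle_author("Richard L. Nixon")
       ('NIXON', 'R')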
"""
lname = author.split(' ')
try:
auinit = lname[0][0]
final = lname[-1].upper()
if final in ['JR.', 'III']:
aulast = lname[-2].upper() + " " + final.strip(".")
else:
aulast = final
except IndexError:
raise ValueError("malformed author name")
return aulast, auinit
def _dfr2paper_map():
"""
Defines the direct relationships between DfR article elements and
:class:`.Paper` fields.
Returns
-------
translator : dict
A 'translator' dictionary.
"""
translator = { 'doi': 'doi',
'title': 'atitle',
'journaltitle': 'jtitle',
'volume': 'volume',
'issue': 'issue' }
return translator
def _create_ayjid(aulast=None, auinit=None, date=None, jtitle=None, **kwargs):
"""
    Converts aulast, auinit, date, and jtitle into the fuzzy identifier ayjid.
Returns 'Unknown paper' if all id components are missing (None).
Parameters
----------
aulast : string
Author surname.
auinit: string
Author initial(s).
date : string
Four-digit year.
jtitle : string
Title of the journal.
Returns
-------
ayj : string
Fuzzy identifier ayjid, or 'Unknown paper' if all id components are
missing (None).
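
    Examples
    --------
    For example (hypothetical values):

    .. code-block:: python

       >>> _create_ayjid('Nixon', 'R', 1960, 'Journal of Ducks')
       'NIXON R 1960 JOURNAL OF DUCKS'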
"""
if aulast is None:
aulast = ''
elif isinstance(aulast, list):
aulast = aulast[0]
if auinit is None:
auinit = ''
elif isinstance(auinit, list):
auinit = auinit[0]
if date is None:
date = ''
if jtitle is None:
jtitle = ''
ayj = aulast + ' ' + auinit + ' ' + str(date) + ' ' + jtitle
    if ayj.strip() == '':    # All components were missing.
ayj = 'Unknown paper'
return ayj.upper()